Merge pull request #13 from PaddlePaddle/develop

merge to local

Merge pull request #13 from PaddlePaddle/develop
merge to local
86511f51 · lujun · GitHub · 5bb04ea4 · ec11135d · 86511f51
123 changed file
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -56,7 +56,7 @@ paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_pr
 paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '0a5308f496632ab1ec3ba1f1377e6f95'))
 paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '41779819cef32f2246e83aebc5a002e2'))
 paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2'))
-paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '582d87b8df75a5a639a107db8ff86f9c'))
+paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '70f4f53f13572436ac72d1c8b5efeb9d'))
 paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb'))
 paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -109,7 +109,7 @@ paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name
 paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3'))
 paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce'))
 paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6'))
-paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'dc7042734c6d8b8ce97321f017f01d6f'))
+paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'f1dd22f7351f7f9853212958e0d8aa7a'))
 paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6'))
 paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2'))
 paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571'))
@@ -205,7 +205,7 @@ paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None,
 paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f207ae10589ebe38a63575ef6ff8e1e'))
 paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '51def402b8910e163cbace9d0c0526ed'))
 paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f'))
-paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)), ('document', '2f46f1ff39a13ab00857e7b9f44b2fa7'))
+paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', 'ab84fdc6dc60f3ad9aa397e6007e3bf9'))
 paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '70e3b5182a18b40b47ecabd7c8490a35'))
 paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '9bb77f8dc002dd2ce75d4769eaaf5007'))
 paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd256cba1c41a5ed92ce3f31e24a2ca6d'))
@@ -296,7 +296,7 @@ paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=Non
 paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9'))
 paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
 paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
-paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
+paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9e27491c39ac74d0b1ffe506aec0ebb'))
 paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd'))
 paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
 paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -9,6 +9,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
 cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
 cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
 cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
+cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)

 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)

@@ -22,6 +23,8 @@ endif()
 if(WITH_GPU)
    nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
            dynload_cuda variable_visitor)
+    nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+            dynload_cuda variable_visitor)
    if(WITH_DISTRIBUTE)
        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
            ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
@@ -35,6 +38,8 @@ if(WITH_GPU)
 else()
    cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
             variable_visitor)
+    cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+            variable_visitor)
    if(WITH_DISTRIBUTE)
        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
            ddim selected_rows_functor sendrecvop_rpc)
@@ -46,9 +51,7 @@ else()
    cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 endif()

-cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
-cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)

 if(WITH_GPU)
 cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
@@ -69,7 +72,9 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)

 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
+
+cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)

 set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
 if (WITH_GPU)
@@ -99,4 +104,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
        multi_devices_graph_print_pass multi_devices_graph_check_pass
        fuse_elewise_add_act_pass multi_batch_merge_pass 
        fuse_relu_depthwise_conv_pass 
-        memory_optimize_pass lock_free_optimize_pass)
+        memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass)
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -11,9 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <algorithm>
-
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include <algorithm>
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -56,6 +55,7 @@ void AllReduceOpHandle::RunImpl() {
  platform::RecordEvent record_event(Name());

  WaitInputVarGenerated();
+
  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
  PADDLE_ENFORCE_EQ(

--- a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
+++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+// Threshold read by SetGroupAccordingToMemorySize: consecutive groups are
+// merged until the accumulated size of a merged group reaches this value.
+// A value of 0 skips size-based grouping entirely.
+// NOTE(review): the threshold is compared directly against
+// SizeOfType(dtype) * product(shape), which looks like a byte count rather
+// than KB -- confirm the intended unit.
+DEFINE_uint32(fuse_parameter_memory_size, 0,  // 0: size-based grouping is off
+              "fuse_parameter_memory_size is up limited memory size "
+              "of one group parameters' gradient which is the input "
+              "of communication calling(e.g NCCLAllReduce). "
+              "The default value is 0, it means that "
+              "not set group according to memory_size.");
+// Number of existing groups that SetGroupAccordingToGroupSize merges into one
+// group. A value of 1 leaves the grouping untouched; -1 merges every existing
+// group into a single one.
+DEFINE_int32(
+    fuse_parameter_groups_size, 3,
+    "fuse_parameter_groups_size is the size of one group parameters' gradient. "
+    "The default value is a experimental result. If the "
+    "fuse_parameter_groups_size is 1, it means that the groups size is "
+    "the number of parameters' gradient. If the fuse_parameter_groups_size is "
+    "-1, it means that there are only one group. The default value is 3, it is "
+    "an experimental value.");
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Layer key shared by all parameters whose name contains no '.' separator.
+static const char kUnKnow[] = "@UNKNOW@";
+// Sentinel meaning "no data type recorded yet" while ApplyImpl checks that
+// all gradients share one data type. It is only ever read, so it is const.
+static const framework::proto::VarType::Type kDefaultDtype =
+    framework::proto::VarType::Type::VarType_Type_BOOL;
+
+class AllocContinuousSpaceForGradPass : public ir::Pass {
+ protected:
+  // Collects the (parameter, gradient) pairs recorded on backward operators,
+  // partitions them into groups, marks every gradient variable as persistable
+  // and runs an "alloc_continuous_space" operator in each local scope to
+  // create the fused gradient variable.
+  //
+  // Reads the pass attributes kPlaces and kLocalScopes. Re-creates the graph
+  // attributes kParamsAndGrads and kGroupGradsAndParams and records the fused
+  // variable name in kFusedVars. Returns early, right after the attributes are
+  // reset, when no gradient is found.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
+    ir::Graph &result = *graph;
+
+    auto &places = Get<const std::vector<platform::Place>>(kPlaces);
+    auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
+
+    // Start from empty attributes even if an earlier run left values behind.
+    ResetAttribute<ParamsAndGrads>(kParamsAndGrads, &result);
+    ResetAttribute<GroupGradsAndParams>(kGroupGradsAndParams, &result);
+
+    // NOTE: The operator nodes should be in topology order.
+    std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
+    auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
+    for (auto &node : topo_nodes) {
+      RecordParamsAndGrads(node, &params_grads);
+    }
+
+    if (params_grads.size() == 0) {
+      VLOG(10) << "Doesn't find gradients";
+      return std::move(graph);
+    }
+
+    // Index the variable nodes by name for the lookups below.
+    std::unordered_map<std::string, ir::Node *> vars;
+    for (ir::Node *node : result.Nodes()) {
+      if (node->IsVar() && node->Var()) {
+        // Note: The graph may have the same name node. For example, parameter
+        // is the input of operator and it also is the output of optimizer;
+        // emplace keeps whichever node with that name is visited first.
+        vars.emplace(node->Var()->Name(), node);
+      }
+    }
+
+    auto &group_grads_params =
+        result.Get<GroupGradsAndParams>(kGroupGradsAndParams);
+
+    // Note: the order of params_grads may be changed by SetGroupGradsAndParams.
+    SetGroupGradsAndParams(vars, params_grads, &group_grads_params);
+
+    // Rebuild params_grads from the groups. Each group is inserted at the
+    // front, so the groups end up in reverse order. Group entries are
+    // (gradient, parameter) pairs, hence the swap back to (parameter,
+    // gradient) afterwards.
+    params_grads.clear();
+    for (auto &group_p_g : group_grads_params) {
+      params_grads.insert(params_grads.begin(), group_p_g.begin(),
+                          group_p_g.end());
+    }
+    for (auto &p_g : params_grads) {
+      std::swap(p_g.first, p_g.second);
+    }
+
+    // Set Gradients as Persistable to prevent this var becoming reusable.
+    auto dtype = kDefaultDtype;
+    for (auto &p_g : params_grads) {
+      // Get gradient var
+      auto iter = vars.find(p_g.second);
+      PADDLE_ENFORCE(iter != vars.end(), "%s is not found.", p_g.second);
+      iter->second->Var()->SetPersistable(true);
+
+      PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType()));
+
+      // Get Dtype: the first gradient fixes the expected data type (which must
+      // differ from the kDefaultDtype sentinel); every gradient must match it.
+      auto ele_dtype = iter->second->Var()->GetDataType();
+      if (dtype == kDefaultDtype) {
+        dtype = ele_dtype;
+        PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype);
+      }
+      PADDLE_ENFORCE_EQ(ele_dtype, dtype);
+    }
+
+    // Create the fused variable name.
+    if (!result.Has(kFusedVars)) {
+      result.Set(kFusedVars, new FusedVars);
+    }
+    const std::string prefix(kFusedVarNamePrefix);
+    // The fused_var_name should be unique; it is derived from the gradient
+    // name of the first pair.
+    auto fused_var_name = prefix + "GRAD@" + params_grads[0].second;
+    auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
+    PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
+    fused_var_set.insert(fused_var_name);
+
+    InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars,
+                                      fused_var_name, params_grads);
+
+    return std::move(graph);
+  }
+
+  // Stores a freshly constructed AttrType under |attr_name| on |graph|,
+  // erasing any value that was already registered under that name.
+  template <typename AttrType>
+  void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const {
+    const bool has_old_value = graph->Has(attr_name);
+    if (has_old_value) {
+      VLOG(10) << attr_name << " is reset.";
+      graph->Erase(attr_name);
+    }
+    graph->Set(attr_name, new AttrType);
+  }
+
+  // Fills |group_grads_params| in three stages that must run in this order:
+  // group by layer, then merge by memory size, then merge by group count.
+  void SetGroupGradsAndParams(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      const ParamsAndGrads &params_grads,
+      GroupGradsAndParams *group_grads_params) const {
+    // Stage 1: one group per layer name prefix.
+    SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params);
+    // Stage 2: merge neighbours up to FLAGS_fuse_parameter_memory_size.
+    SetGroupAccordingToMemorySize(var_nodes, group_grads_params);
+    // Stage 3: merge neighbours in runs of FLAGS_fuse_parameter_groups_size.
+    SetGroupAccordingToGroupSize(var_nodes, group_grads_params);
+  }
+
+  // Appends one group per layer to |group_grads_params|. The layer of a
+  // parameter is the part of its name before the first '.'; parameters
+  // without a '.' share the kUnKnow layer. Groups are appended in the order
+  // in which their layer is first seen, and each group holds
+  // (gradient, parameter) pairs in the order of |params_grads|.
+  void SetGroupAccordingToLayers(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      const ParamsAndGrads &params_grads,
+      GroupGradsAndParams *group_grads_params) const {
+    // Maps a layer name to the index of its group in |group_grads_params|.
+    std::unordered_map<std::string, size_t> layer_to_group;
+
+    for (const auto &param_grad : params_grads) {
+      const std::string &param_name = param_grad.first;
+      const auto dot_pos = param_name.find_first_of(".");
+      const std::string layer = (dot_pos == std::string::npos)
+                                    ? std::string(kUnKnow)
+                                    : param_name.substr(0, dot_pos);
+
+      auto found = layer_to_group.find(layer);
+      if (found == layer_to_group.end()) {
+        // First parameter of this layer: open a new group at the back.
+        found =
+            layer_to_group.emplace(layer, group_grads_params->size()).first;
+        group_grads_params->emplace_back();
+      }
+      group_grads_params->at(found->second)
+          .emplace_back(std::make_pair(param_grad.second, param_grad.first));
+    }
+
+    VLOG(10) << "SetGroupAccordingToLayers: ";
+    size_t group_idx = 0;
+    for (const auto &group : *group_grads_params) {
+      VLOG(10) << "group " << group_idx;
+      // Incremented outside VLOG so the counter does not depend on verbosity.
+      ++group_idx;
+      std::stringstream out;
+      for (const auto &g_p : group) {
+        out << "(" << g_p.second << ", " << g_p.first << "), ";
+      }
+      VLOG(10) << out.str();
+    }
+  }
+
+  // Merges consecutive groups of |group_grads_params| until the accumulated
+  // size of a merged group reaches FLAGS_fuse_parameter_memory_size; the last
+  // merged group may stay below the limit. Does nothing when the flag is 0.
+  // The size of a group entry is looked up through the second element of its
+  // pair in |var_nodes|; a missing name is an enforce failure.
+  void SetGroupAccordingToMemorySize(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      GroupGradsAndParams *group_grads_params) const {
+    if (FLAGS_fuse_parameter_memory_size == 0) {
+      return;
+    }
+    const size_t size_limit =
+        static_cast<size_t>(FLAGS_fuse_parameter_memory_size);
+
+    // Size of one variable: element size multiplied by every shape dimension.
+    const auto var_size = [&var_nodes](const std::string &var_name) -> size_t {
+      auto iter = var_nodes.find(var_name);
+      PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", var_name);
+      auto shape = iter->second->Var()->GetShape();
+      size_t size = framework::SizeOfType(iter->second->Var()->GetDataType());
+      for (const int64_t &dim : shape) {
+        size *= dim;
+      }
+      return size;
+    };
+
+    GroupGradsAndParams merged_groups;
+    const size_t total = group_grads_params->size();
+    size_t next = 0;
+    while (next < total) {
+      merged_groups.emplace_back();
+      auto &merged = merged_groups.back();
+      size_t merged_size = 0;
+      // Always take at least one group, then keep going while below the limit.
+      do {
+        const auto &group = group_grads_params->at(next);
+        for (const auto &g_p : group) {
+          merged_size += var_size(g_p.second);
+        }
+        merged.insert(merged.end(), group.begin(), group.end());
+        ++next;
+      } while (next < total && merged_size < size_limit);
+    }
+
+    std::swap(*group_grads_params, merged_groups);
+
+    VLOG(10) << string::Sprintf(
+        "SetGroupAccordingToMemorySize(memory_size: %d):",
+        FLAGS_fuse_parameter_memory_size);
+    size_t group_idx = 0;
+    for (const auto &group : *group_grads_params) {
+      VLOG(10) << "group " << group_idx;
+      // Incremented outside VLOG so the counter does not depend on verbosity.
+      ++group_idx;
+      std::stringstream out;
+      for (const auto &g_p : group) {
+        const size_t size = var_size(g_p.second);
+        out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first);
+      }
+      VLOG(10) << out.str();
+    }
+  }
+
+  // Merges every run of FLAGS_fuse_parameter_groups_size consecutive groups of
+  // |group_grads_params| into one group; the last merged group may be smaller.
+  //   * flag == 1  : the grouping is left untouched.
+  //   * flag == -1 : all existing groups are merged into a single group.
+  //   * flag  > 1  : runs of that many groups are merged.
+  // Any other value is rejected with an enforce failure. The flag is validated
+  // before it is converted to size_t, so a negative value can no longer turn
+  // into a huge group size.
+  void SetGroupAccordingToGroupSize(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      GroupGradsAndParams *group_grads_params) const {
+    const auto flag_value = FLAGS_fuse_parameter_groups_size;
+    if (flag_value == 1) {
+      return;
+    }
+    PADDLE_ENFORCE(flag_value == -1 || flag_value > 1,
+                   "fuse_parameter_groups_size should be -1 or a positive "
+                   "integer, but received %d.",
+                   flag_value);
+
+    const size_t total = group_grads_params->size();
+    if (total == 0) {
+      // Nothing to merge; also avoids a zero group size when the flag is -1.
+      return;
+    }
+    // With -1 a single existing group is valid: it is simply kept as it is.
+    const size_t group_size =
+        (flag_value == -1) ? total : static_cast<size_t>(flag_value);
+    const size_t groups = (total + group_size - 1) / group_size;
+
+    GroupGradsAndParams merged_groups;
+    merged_groups.reserve(groups);
+    for (size_t begin = 0; begin < total; begin += group_size) {
+      const size_t end = std::min(begin + group_size, total);
+      merged_groups.emplace_back();
+      // The element count of the merged group is not known up front
+      // (group_size counts groups, not pairs), so no reserve is done here.
+      auto &merged = merged_groups.back();
+      for (size_t j = begin; j < end; ++j) {
+        const auto &group = group_grads_params->at(j);
+        merged.insert(merged.end(), group.begin(), group.end());
+      }
+    }
+    std::swap(*group_grads_params, merged_groups);
+
+    VLOG(10) << "SetGroupAccordingToGroupSize(group_size: " << group_size
+             << "): ";
+    for (size_t i = 0; i < group_grads_params->size(); ++i) {
+      VLOG(10) << "group " << i;
+      std::stringstream out;
+      for (auto &p_g : group_grads_params->at(i)) {
+        out << "(" << p_g.second << ", " << p_g.first << "), ";
+      }
+      VLOG(10) << out.str();
+    }
+  }
+
+ private:
+  // Reports whether a variable of |type| can be handled by this pass; at
+  // present that is LOD_TENSOR only.
+  bool IsSupportedVarType(const proto::VarType::Type &type) const {
+    const bool is_lod_tensor = (proto::VarType::LOD_TENSOR == type);
+    return is_lod_tensor;
+  }
+
+  // Appends an "alloc_continuous_space" operator to |global_block| with
+  // |params_name| as "Input", |grads_name| as "Output" and |fused_var_name|
+  // as the single "FusedOutput".
+  void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
+                                 const std::vector<std::string> &grads_name,
+                                 const std::string &fused_var_name,
+                                 BlockDesc *global_block) const {
+    auto alloc_op = global_block->AppendOp();
+    alloc_op->SetType("alloc_continuous_space");
+
+    // Inputs first, then both outputs.
+    alloc_op->SetInput("Input", params_name);
+    alloc_op->SetOutput("Output", grads_name);
+    alloc_op->SetOutput("FusedOutput", {fused_var_name});
+  }
+
+  // Appends the (parameter, gradient) name pairs recorded on |node| to
+  // |params_grads|. Only operators whose role attribute has the kBackward bit
+  // set are considered. The pairs are read from the operator's role-var
+  // attribute, a flat list of alternating parameter and gradient names.
+  // An operator whose attributes cannot be read as the expected types
+  // contributes nothing.
+  void RecordParamsAndGrads(ir::Node *node,
+                            ParamsAndGrads *params_grads) const {
+    try {
+      bool is_bk_op =
+          static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
+                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                            static_cast<int>(OpRole::kBackward));
+      if (!is_bk_op) return;
+
+      // Currently, we assume that once gradient is generated, it can be
+      // broadcast, and each gradient is only broadcast once.
+      auto backward_vars =
+          boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
+              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+      // The list holds param/grad pairs, so its length must be even.
+      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));
+
+      for (size_t i = 0; i < backward_vars.size(); i += 2) {
+        VLOG(10) << "Trainable parameter: " << backward_vars[i]
+                 << ", gradient: " << backward_vars[i + 1];
+
+        params_grads->emplace_back(std::make_pair(
+            backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/));
+      }
+    } catch (const boost::bad_get &) {
+      // Intentionally ignored: boost::get throws when an attribute does not
+      // hold the requested type, and such an operator simply has no pairs to
+      // record. Caught by const reference to avoid copying the exception.
+    }
+  }
+
+  // Creates the fused variable and every gradient variable as LoDTensor in
+  // each local scope, then builds a temporary program holding a single
+  // "alloc_continuous_space" operator and runs it once per local scope on the
+  // place with the same index.
+  //
+  // Enforce failures are raised when there are fewer places than local
+  // scopes, when |fused_var_name| already exists in a scope, or when a
+  // gradient is missing from |vars| or is not a LOD_TENSOR.
+  void InitFusedVarsAndAllocSpaceForVars(
+      const std::vector<platform::Place> &places,
+      const std::vector<Scope *> &local_scopes,
+      const std::unordered_map<std::string, ir::Node *> &vars,
+      const std::string &fused_var_name,
+      const ParamsAndGrads &params_grads) const {
+    // places[i] is paired with local_scopes[i] below, so every scope needs a
+    // matching place; fail loudly instead of indexing out of range.
+    PADDLE_ENFORCE(places.size() >= local_scopes.size(),
+                   "The number of places (%d) is less than the number of "
+                   "local scopes (%d).",
+                   places.size(), local_scopes.size());
+
+    //  Init Gradients and FusedVars
+    VLOG(10) << "Init FusedVars and Gradients.";
+    // The scopes are visited from last to first.
+    for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
+      auto &scope = *it;
+
+      PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
+                     "%s has existed in scope.", fused_var_name);
+      scope->Var(fused_var_name)->GetMutable<LoDTensor>();
+
+      for (auto &p_g : params_grads) {
+        auto iter = vars.find(p_g.second);
+        PADDLE_ENFORCE(iter != vars.end());
+        PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
+        PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
+                          proto::VarType::LOD_TENSOR);
+        scope->Var(p_g.second)->GetMutable<LoDTensor>();
+      }
+    }
+
+    // Split the (parameter, gradient) pairs into the operator's two lists.
+    std::vector<std::string> grads_name;
+    std::vector<std::string> params_name;
+    grads_name.reserve(params_grads.size());
+    params_name.reserve(params_grads.size());
+    for (auto &p_g : params_grads) {
+      params_name.emplace_back(p_g.first);
+      grads_name.emplace_back(p_g.second);
+    }
+    framework::ProgramDesc program_desc;
+    AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
+                              program_desc.MutableBlock(0));
+
+    // Run Only Once Programs
+    for (size_t i = 0; i < local_scopes.size(); ++i) {
+      for (auto &op_desc : program_desc.Block(0).AllOps()) {
+        auto op = OpRegistry::CreateOp(*op_desc);
+        op->Run(*local_scopes[i], places[i]);
+      }
+    }
+  }
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+// Registers AllocContinuousSpaceForGradPass under the name
+// "alloc_continuous_space_for_grad_pass" and declares kPlaces and
+// kLocalScopes as required pass attributes; ApplyImpl reads both.
+REGISTER_PASS(alloc_continuous_space_for_grad_pass,
+              paddle::framework::details::AllocContinuousSpaceForGradPass)
+    .RequirePassAttr(paddle::framework::details::kPlaces)
+    .RequirePassAttr(paddle::framework::details::kLocalScopes);
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -57,7 +57,7 @@ struct BroadcastOpHandle : public OpHandleBase {

  std::string Name() const override;

-  bool IsMultiDeviceTransfer() override { return false; };
+  bool IsMultiDeviceTransfer() override { return true; };

 protected:
  void RunImpl() override;

--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -46,7 +46,16 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
 public:
  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
      : ir::PassBuilder(), strategy_(strategy) {
+    // Add a graph viz pass to record a graph.
+    if (!strategy_.debug_graphviz_path_.empty()) {
+      auto viz_pass = AppendPass("graph_viz_pass");
+      const std::string graph_path = string::Sprintf(
+          "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
+      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    }
+
    if (strategy_.enable_sequential_execution_) {
+      VLOG(10) << "Add sequential_execution_pass";
      AppendPass("sequential_execution_pass");
    }

@@ -57,6 +66,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {

    // Add op fusion.
    if (strategy.fuse_relu_depthwise_conv_) {
+      VLOG(10) << "Add fuse_relu_depthwise_conv_pass";
      AppendPass("fuse_relu_depthwise_conv_pass");
    }

@@ -68,27 +78,28 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {

    // Add automatically inplace.
    if (strategy_.enable_inplace_) {
+      VLOG(10) << "Add inplace_pass";
      AppendPass("inplace_pass");
    }

-    // Add a graph viz pass to record a graph.
-    if (!strategy_.debug_graphviz_path_.empty()) {
-      auto viz_pass = AppendPass("graph_viz_pass");
-      const std::string graph_path = string::Sprintf(
-          "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
-      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    if (strategy.fuse_elewise_add_act_ops_) {
+      VLOG(10) << "Add fuse_elewise_add_act_pass";
+      AppendPass("fuse_elewise_add_act_pass");
+    }
+
+    // for single card training, fuse_all_reduce_ops is unnecessary.
+    // alloc_continuous_space_for_grad_pass should be before of MultiDevPass.
+    if (strategy.fuse_all_reduce_ops_) {
+      VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
+      AppendPass("alloc_continuous_space_for_grad_pass");
    }

-    if (strategy.fuse_elewise_add_act_ops_) {
-      auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
    // Add a graph viz pass to record a graph.
    if (!strategy.debug_graphviz_path_.empty()) {
      auto viz_pass = AppendPass("graph_viz_pass");
      const std::string graph_path = string::Sprintf(
          "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
-        viz_pass->Set<std::string>("graph_viz_path",
-                                   new std::string(graph_path));
-      }
+      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
    }

    CollectiveContext *context = CollectiveContext::GetInstance();
@@ -108,11 +119,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
    // A side-effect of that, memory optimize cannot forsee the fetched vars
    // , so fetchlist should be set persistable before call the Run interface.
    if (strategy.memory_optimize_) {
-      auto memory_optimize_pass = AppendPass("memory_optimize_pass");
+      VLOG(10) << "Add memory_optimize_pass";
+      AppendPass("memory_optimize_pass");
    }

    AppendMultiDevPass(strategy);

+    if (strategy.fuse_all_reduce_ops_) {
+      // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator
+      // first, if the number is zero, fuse_all_reduce_ops will do nothing.
+      VLOG(10) << "Add fuse_all_reduce_op_pass";
+      AppendPass("fuse_all_reduce_op_pass");
+    }
+
    // Add a graph print pass to record a graph with device info.
    if (!strategy_.debug_graphviz_path_.empty()) {
      auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
@@ -128,28 +147,34 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
    // Verify that the graph is correct for multi-device executor.
    AppendPass("multi_devices_check_pass");

+    if (VLOG_IS_ON(2)) {
+      AppendPass("all_reduce_deps_pass");
+    }
+
    if (SeqOnlyAllReduceOps(strategy)) {
+      VLOG(10) << "Add all_reduce_deps_pass";
      AppendPass("all_reduce_deps_pass");
    }

    if (strategy_.remove_unnecessary_lock_) {
+      VLOG(10) << "Add modify_op_lock_and_record_event_pass";
      AppendPass("modify_op_lock_and_record_event_pass");
    }
  }

  // Convert graph to run on multi-devices.
  void AppendMultiDevPass(const BuildStrategy &strategy) {
-    ir::Pass *multi_devices_pass;
+    ir::Pass *multi_devices_pass = nullptr;
    if (strategy_.is_distribution_) {
-      VLOG(3) << "multi device parameter server mode";
+      VLOG(10) << "Add dist_multi_devices_pass";
      multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
    } else {
      if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
-        VLOG(3) << "multi devices collective mode with allreduce";
+        VLOG(10) << "Add all_reduce_mode_multi_devices_pass";
        multi_devices_pass =
-            AppendPass("allreduce_mode_multi_devices_pass").get();
+            AppendPass("all_reduce_mode_multi_devices_pass").get();
      } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
-        VLOG(3) << "multi deivces collective mode with reduce";
+        VLOG(10) << "Add reduce_mode_multi_devices_pass";
        multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
      } else {
        PADDLE_THROW("Unknown reduce strategy.");
@@ -206,9 +231,26 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(

 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
-      pass->Erase("nccl_ctxs");
-      pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
+      pass->Erase(kNCCLCtxs);
+      pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
 #endif
+    } else if (pass->Type() == "fuse_all_reduce_op_pass") {
+      pass->Erase(kPlaces);
+      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
+      pass->Erase(kLocalScopes);
+      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
+                                                    &local_scopes);
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+      pass->Erase(kNCCLCtxs);
+      pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
+#endif
+    } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
+      pass->Erase(kPlaces);
+      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
+      pass->Erase(kLocalScopes);
+      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
+                                                    &local_scopes);
    } else if (pass->Type() == "sequential_execution_pass") {
      LOG(INFO) << "set enable_sequential_execution:"
                << enable_sequential_execution_;
@@ -239,7 +281,7 @@ USE_PASS(fuse_elewise_add_act_pass);
 USE_PASS(graph_viz_pass);
 USE_PASS(multi_batch_merge_pass);
 USE_PASS(reduce_mode_multi_devices_pass);
-USE_PASS(allreduce_mode_multi_devices_pass);
+USE_PASS(all_reduce_mode_multi_devices_pass);
 USE_PASS(dist_multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
@@ -249,4 +291,6 @@ USE_PASS(all_reduce_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
 USE_PASS(inplace_pass);
 USE_PASS(lock_free_optimize_pass);
+USE_PASS(alloc_continuous_space_for_grad_pass);
 USE_PASS(graph_to_program_pass);
+USE_PASS(fuse_all_reduce_op_pass);
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -16,6 +16,7 @@

 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>

 #include "paddle/fluid/framework/ir/pass_builder.h"
@@ -75,6 +76,8 @@ struct BuildStrategy {

  bool fuse_elewise_add_act_ops_{false};

+  bool fuse_all_reduce_ops_{false};
+
  bool fuse_relu_depthwise_conv_{false};

  bool sync_batch_norm_{false};

--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/data_balance_op_handle.h"
-#include <algorithm>
-#include "paddle/fluid/framework/details/container_cast.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-DataBalanceOpHandle::DataBalanceOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    const platform::NCCLContextMap *ctxs)
-    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
-  if (ctxs) {
-    for (auto &p : places_) {
-      this->SetDeviceContext(p, ctxs->DevCtx(p));
-    }
-  }
-}
-#else
-DataBalanceOpHandle::DataBalanceOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places)
-    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
-#endif
-
-std::string DataBalanceOpHandle::Name() const { return "data balance"; }
-
-std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
-    const std::vector<int> &device_sizes) {
-  int device_num = device_sizes.size();
-  int total_size = 0;
-  int empty_num = 0;
-  std::vector<std::array<int, 2>> size_device_vec;
-  size_device_vec.reserve(device_num);
-  for (int i = 0; i < device_num; ++i) {
-    if (device_sizes[i] == 0) {
-      ++empty_num;
-    }
-    total_size += device_sizes[i];
-    size_device_vec.push_back({{device_sizes[i], i}});
-  }
-  std::vector<std::array<int, 3>> res;
-  if (empty_num == 0) {
-    // No need to do data balance.
-    return res;
-  }
-  if (total_size < device_num) {
-    // No enough data.
-    PADDLE_THROW_EOF();
-  }
-  std::sort(size_device_vec.begin(), size_device_vec.end(),
-            [](const std::array<int, 2> &a, const std::array<int, 2> &b) {
-              return a[0] > b[0];
-            });
-  int expected_device_size = total_size / device_num;
-  int src_idx = 0;
-  for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
-    if (size_device_vec[src_idx][0] <= expected_device_size) {
-      ++src_idx;
-      PADDLE_ENFORCE_LT(
-          src_idx, device_num - empty_num,
-          "In current srategy an empty tensor should not be copy source.");
-    }
-    size_device_vec[src_idx][0] -= expected_device_size;
-    size_device_vec[dst_idx][0] += expected_device_size;
-    res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
-                    expected_device_size}});
-  }
-  return res;
-}
-
-void DataBalanceOpHandle::RunImpl() {
-  PADDLE_ENFORCE_GT(places_.size(), 1UL,
-                    "Data balance can only be enabled when the number of "
-                    "places to run larger than 1.");
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-  int data_num = in_var_handles.size() / places_.size();
-  WaitInputVarGenerated();
-  std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
-  std::vector<int> device_sizes;
-  for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
-    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
-                      "The name of input and output should be equal.");
-    int place_idx = i / data_num;
-    int data_idx = i % data_num;
-    auto *local_scope =
-        local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
-    auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name());
-    PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
-    auto *tensor = tensor_var->GetMutable<LoDTensor>();
-    lod_tensors[data_idx].push_back(tensor);
-    int ins_size =
-        tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
-    if (data_idx == 0) {
-      device_sizes.emplace_back(ins_size);
-    } else {
-      PADDLE_ENFORCE_EQ(
-          ins_size, device_sizes.at(place_idx),
-          "All data on the same device shall have the same batch size.");
-    }
-  }
-  const auto &balance_plan = GetBalancePlan(device_sizes);
-
-  for (const auto &trans : balance_plan) {
-    for (int data_idx = 0; data_idx < data_num; ++data_idx) {
-      LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
-      LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
-      int trans_ins_size = trans[2];
-      LoD src_lod = src_tensor->lod();
-      int src_ins_size =
-          src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
-      int cut_point = src_ins_size - trans_ins_size;
-      if (!src_lod.empty()) {
-        for (auto &level : src_lod) {
-          cut_point = level[cut_point];
-        }
-      }
-      TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
-                     dst_tensor->place(), dst_tensor);
-      src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
-      if (!src_lod.empty()) {
-        dst_tensor->set_lod(SliceInLevel(
-            src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
-        src_tensor->set_lod(
-            SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
-      }
-    }
-  }
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -82,6 +82,8 @@ void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
  }
 }

+bool FetchOpHandle::IsMultiDeviceTransfer() { return true; }
+
 std::string FetchOpHandle::Name() const { return "Fetch"; }

 }  // namespace details

--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -39,6 +39,8 @@ struct FetchOpHandle : public OpHandleBase {

  std::string Name() const override;

+  bool IsMultiDeviceTransfer() override;
+
 protected:
  void RunImpl() override;


--- a/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc
+++ b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Graph pass that replaces the AllReduceOpHandle nodes of each gradient group
+// with a single FusedAllReduceOpHandle node.
+//
+// Pass attributes read: kPlaces, kLocalScopes and, in CUDA non-Windows builds,
+// kNCCLCtxs. Graph attributes read: kParamsAndGrads and, only when at least
+// one all_reduce op exists, kGroupGradsAndParams.
+class FuseAllReduceOpPass : public ir::Pass {
+ protected:
+  // Finds every AllReduceOpHandle in `graph`, keyed by the gradient it
+  // reduces, then fuses them group by group. Returns the same graph object;
+  // it is left unmodified when no all_reduce op is found.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
+    ir::Graph &result = *graph;
+
+    auto &places = Get<const std::vector<platform::Place>>(kPlaces);
+    auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    auto *nccl_ctxs = &Get<platform::NCCLContextMap>(kNCCLCtxs);
+#endif
+
+    // Names of all gradients recorded on the graph.
+    std::unordered_set<std::string> grads;
+    auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
+    size_t num_of_all_reduce = params_grads.size();
+    grads.reserve(num_of_all_reduce);
+    // Bind by const reference: copying every (param, grad) pair is wasteful.
+    for (const auto &p_g : params_grads) {
+      grads.insert(p_g.second);
+    }
+
+    size_t num_place = places.size();
+    // Gradient name -> graph node of the all_reduce op that reduces it.
+    std::unordered_map<std::string, ir::Node *> all_reduce_ops;
+    all_reduce_ops.reserve(grads.size());
+    for (auto &node : result.Nodes()) {
+      if (node->IsOp()) {
+        PADDLE_ENFORCE(node->IsWrappedBy<OpHandleBase>());
+        auto *all_reduce_op_handle =
+            dynamic_cast<AllReduceOpHandle *>(&node->Wrapper<OpHandleBase>());
+        if (all_reduce_op_handle) {
+          auto inputs = DynamicCast<VarHandle>(all_reduce_op_handle->Inputs());
+          // One input per place is expected.
+          PADDLE_ENFORCE_EQ(inputs.size(), num_place);
+          // The inputs' name should be the same.
+          auto &grad_name = inputs[0]->name();
+          for (size_t i = 1; i < inputs.size(); ++i) {
+            PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
+                              "The input name should be the same.");
+          }
+          // Every all_reduce op must reduce a known gradient.
+          PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
+          all_reduce_ops.emplace(grad_name, node);
+        }
+      }
+    }
+
+    VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size();
+    if (all_reduce_ops.empty()) {
+      // Nothing to fuse; hand the graph back unchanged.
+      return std::move(graph);
+    }
+
+    PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(),
+                      "The number of all_reduce OpHandle is not equal to the "
+                      "number of grads. Maybe some gradients are sparse type, "
+                      "it is not supported currently.");
+    VLOG(10) << "Insert fused_all_reduce";
+
+    auto &group_grads_params =
+        result.Get<GroupGradsAndParams>(kGroupGradsAndParams);
+
+    for (auto &group_g_p : group_grads_params) {
+      size_t group_size = group_g_p.size();
+      PADDLE_ENFORCE_GT(group_size, static_cast<size_t>(0));
+      // all_reduce nodes of this group, in the group's gradient order.
+      std::vector<ir::Node *> group_all_reduce_ops;
+      group_all_reduce_ops.reserve(group_size);
+      for (auto &g_p : group_g_p) {
+        // g_p.first is used as the gradient-name key; at() throws if the
+        // gradient has no all_reduce op.
+        group_all_reduce_ops.emplace_back(all_reduce_ops.at(g_p.first));
+      }
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+      InsertFusedAllReduce(places, local_scopes, group_size,
+                           group_all_reduce_ops, nccl_ctxs, &result);
+#else
+      InsertFusedAllReduce(places, local_scopes, group_size,
+                           group_all_reduce_ops, &result);
+#endif
+    }
+    return std::move(graph);
+  }
+
+  // Removes the given all_reduce nodes from `result` and creates one fused
+  // all_reduce op that takes over all of their inputs and outputs.
+  // `num_of_all_reduce` is the number of ops being fused.
+  void InsertFusedAllReduce(const std::vector<platform::Place> &places,
+                            const std::vector<Scope *> &local_scopes,
+                            const size_t num_of_all_reduce,
+                            const std::vector<ir::Node *> &all_reduce_ops,
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+                            const platform::NCCLContextMap *nccl_ctxs,
+#endif
+                            ir::Graph *result) const {
+    std::vector<VarHandleBase *> inputs;
+    std::vector<VarHandleBase *> outputs;
+    for (auto &op : all_reduce_ops) {
+      auto &op_handle = op->Wrapper<OpHandleBase>();
+      inputs.insert(inputs.end(), op_handle.Inputs().begin(),
+                    op_handle.Inputs().end());
+      // Unlink the old op from each of its input variables.
+      std::for_each(op_handle.Inputs().begin(), op_handle.Inputs().end(),
+                    [&op_handle](VarHandleBase *var_handle) {
+                      var_handle->RemoveOutput(&op_handle, op_handle.Node());
+                    });
+
+      outputs.insert(outputs.end(), op_handle.Outputs().begin(),
+                     op_handle.Outputs().end());
+      // Clear the generating op of each output variable.
+      std::for_each(
+          op_handle.Outputs().begin(), op_handle.Outputs().end(),
+          [](VarHandleBase *var_handle) { var_handle->ClearGeneratedOp(); });
+
+      result->RemoveNode(op_handle.Node());
+    }
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
+                           local_scopes, nccl_ctxs, result);
+#else
+    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
+                           local_scopes, result);
+#endif
+  }
+
+ private:
+  // Creates a "fused_all_reduce" operation node in `result`, wraps it in a
+  // FusedAllReduceOpHandle and connects `inputs` / `outputs` to it.
+  void CreateFusedAllReduceOp(const std::vector<VarHandleBase *> &inputs,
+                              const std::vector<VarHandleBase *> &outputs,
+                              const size_t num_of_all_reduce,
+                              const std::vector<platform::Place> &places,
+                              const std::vector<Scope *> &local_scopes,
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+                              const platform::NCCLContextMap *nccl_ctxs,
+#endif
+                              ir::Graph *result) const {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    auto *op_handle = new FusedAllReduceOpHandle(
+        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
+        local_scopes, places, num_of_all_reduce, nccl_ctxs);
+#else
+    auto *op_handle = new FusedAllReduceOpHandle(
+        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
+        local_scopes, places, num_of_all_reduce);
+#endif
+
+    for (auto *in : inputs) {
+      op_handle->AddInput(in);
+    }
+
+    for (auto *out : outputs) {
+      op_handle->AddOutput(out);
+    }
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    // With NCCL contexts the handle's constructor already set the device
+    // contexts; fall back to the global pool only when there are none.
+    if (!nccl_ctxs) {
+      SetCommunicationContext(places, op_handle);
+    }
+#else
+    SetCommunicationContext(places, op_handle);
+#endif
+  }
+
+  // Assigns to `op_handle`, for every place, the device context held by the
+  // global DeviceContextPool.
+  void SetCommunicationContext(const std::vector<platform::Place> &places,
+                               FusedAllReduceOpHandle *op_handle) const {
+    for (size_t i = 0; i < places.size(); ++i) {
+      op_handle->SetDeviceContext(
+          places[i], platform::DeviceContextPool::Instance().Get(places[i]));
+    }
+  }
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fuse_all_reduce_op_pass,
+              paddle::framework::details::FuseAllReduceOpPass);
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-void FuseVarsOpHandle::RunImpl() {
-  WaitInputVarGenerated(place_);
-
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
-  PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
-
-  auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-
-  auto out_var_handle = out_var_handles[0];
-  auto out_var = scope->Var(out_var_handle->name());
-
-  auto out_tensor = out_var->GetMutable<LoDTensor>();
-  out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
-
-  int64_t s = 0;
-  for (size_t i = 1; i < out_var_handles.size(); ++i) {
-    auto out_name = out_var_handles[i]->name();
-    auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
-    auto numel = this->inputs_numel_.at(out_name);
-    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
-    s += numel;
-  }
-  this->RunAndRecordEvent([] {});
-}
-
-std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.h
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct FuseVarsOpHandle : public OpHandleBase {
- public:
-  FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
-                   const platform::Place &place,
-                   const std::unordered_map<std::string, int64_t> &inputs_numel,
-                   const proto::VarType::Type var_type)
-      : OpHandleBase(node),
-        local_scope_(local_scope),
-        place_(place),
-        inputs_numel_(inputs_numel),
-        type_(var_type) {
-    total_numel_ = 0;
-    for (auto in_numel : inputs_numel) {
-      PADDLE_ENFORCE_GT(in_numel.second, 0);
-      total_numel_ += in_numel.second;
-    }
-  }
-
-  std::string Name() const override;
-
-  bool IsMultiDeviceTransfer() override { return false; };
-
- protected:
-  void RunImpl() override;
-
- private:
-  Scope *local_scope_;
-  const platform::Place place_;
-  const std::unordered_map<std::string, int64_t> inputs_numel_;
-  const proto::VarType::Type type_;
-  int64_t total_numel_;
-};
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
+#include <algorithm>
+#include <utility>
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_bool(skip_fused_all_reduce_check, false, "");
+namespace paddle {
+namespace framework {
+namespace details {
+
+typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
+    GradientAndLoDTensor;
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+// CUDA (non-Windows) build: also receives the NCCL context map. When `ctxs`
+// is non-null, the device context of every place is taken from it.
+FusedAllReduceOpHandle::FusedAllReduceOpHandle(
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
+    const platform::NCCLContextMap *ctxs)
+    : OpHandleBase(node),
+      local_scopes_(local_scopes),
+      places_(places),
+      num_of_all_reduce_(num_of_all_reduce),
+      nccl_ctxs_(ctxs) {
+  if (nccl_ctxs_ != nullptr) {
+    for (const auto &place : places_) {
+      this->SetDeviceContext(place, nccl_ctxs_->DevCtx(place));
+    }
+  }
+  // There must be exactly one local scope per place.
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+}
+#else
+
+// Build without NCCL: only the scopes, the places and the number of fused
+// all_reduce ops are stored.
+FusedAllReduceOpHandle::FusedAllReduceOpHandle(
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places, const size_t num_of_all_reduce)
+    : OpHandleBase(node),
+      local_scopes_(local_scopes),
+      places_(places),
+      num_of_all_reduce_(num_of_all_reduce) {
+  // There must be exactly one local scope per place.
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+}
+
+#endif
+
+// Runs one all_reduce over the fused gradient buffer of every place.
+//
+// For each local scope the gradients are collected, checked to share one
+// dtype, sorted by address and verified to lie back to back in memory, so the
+// lowest-address gradient's data pointer can be used as a single buffer of
+// `numel` elements. On GPU places the buffers are reduced with ncclAllReduce;
+// otherwise they are reduced into scope 0's buffer and the result is copied
+// to the remaining scopes.
+void FusedAllReduceOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name());
+
+  VLOG(4) << this->DebugString();
+
+  WaitInputVarGenerated();
+  // The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
+  // The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+
+  size_t place_num = places_.size();
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), place_num * num_of_all_reduce_,
+      "The NoDummyInputSize should be equal to the number of places times "
+      "the number of fused all_reduce ops.");
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), out_var_handles.size(),
+      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+
+  // grads_tensor[scope_idx] holds the (name, tensor) pairs of that scope.
+  GradientAndLoDTensor grads_tensor;
+  grads_tensor.resize(place_num);
+
+  // -1 / 0 mean "not determined yet"; both are fixed by the first scope.
+  // NOTE(review): numel is taken from the first scope only; the element
+  // counts of later scopes are not compared against it.
+  int64_t numel = -1;
+  auto dtype = static_cast<framework::proto::VarType::Type>(0);
+  for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
+    auto &g_tensor = grads_tensor.at(scope_idx);
+    g_tensor.reserve(num_of_all_reduce_);
+
+    GetGradLoDTensor(scope_idx, in_var_handles, out_var_handles, &g_tensor);
+
+    int64_t element_num = 0;
+    framework::proto::VarType::Type ele_dtype =
+        static_cast<framework::proto::VarType::Type>(0);
+    GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num);
+
+    if (numel == -1) {
+      numel = element_num;
+    }
+    if (dtype == static_cast<framework::proto::VarType::Type>(0)) {
+      dtype = ele_dtype;
+      PADDLE_ENFORCE_NE(ele_dtype,
+                        static_cast<framework::proto::VarType::Type>(0));
+    }
+    // All scopes must use the same dtype.
+    PADDLE_ENFORCE_EQ(ele_dtype, dtype);
+
+    // Check whether the address space is contiguous.
+    std::sort(
+        g_tensor.begin(), g_tensor.end(),
+        [](const std::pair<std::string, const LoDTensor *> &grad1,
+           const std::pair<std::string, const LoDTensor *> &grad2) -> bool {
+          return grad1.second->data<void>() < grad2.second->data<void>();
+        });
+
+    for (size_t k = 1; k < g_tensor.size(); ++k) {
+      const void *cur_address = g_tensor.at(k - 1).second->data<void>();
+      int64_t len = g_tensor.at(k - 1).second->numel();
+      auto offset = len * framework::SizeOfType(dtype);
+      // Where tensor k must start if it directly follows tensor k - 1.
+      void *infer_next_address = reinterpret_cast<void *>(
+          reinterpret_cast<uintptr_t>(cur_address) + offset);
+      const void *next_address = g_tensor.at(k).second->data<void>();
+
+      // Argument order follows the format string: index first, then name.
+      VLOG(10) << string::Sprintf(
+          "Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer "
+          "input[%d] address: 0X%02x. The offset: %d",
+          k - 1, g_tensor.at(k - 1).first, cur_address, k, g_tensor.at(k).first,
+          next_address, k, infer_next_address, offset);
+      PADDLE_ENFORCE_EQ(infer_next_address, next_address,
+                        "The address is not consistent.");
+    }
+  }
+
+  if (!FLAGS_skip_fused_all_reduce_check) {
+    // After sorting by address, every scope must list the gradients in the
+    // same order as scope 0.
+    for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
+      for (size_t j = 1; j < num_of_all_reduce_; ++j) {
+        PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
+                          grads_tensor.at(scope_idx).at(j).first);
+      }
+    }
+  }
+
+  // Start address of the fused buffer of each scope (lowest-address tensor).
+  std::vector<const void *> lod_tensor_data;
+  for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
+    auto data = grads_tensor.at(scope_idx).at(0).second->data<void>();
+    lod_tensor_data.emplace_back(data);
+  }
+
+  if (platform::is_gpu_place(places_[0])) {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+    int nccl_dtype = platform::ToNCCLDataType(dtype);
+    std::vector<std::function<void()>> all_reduce_calls;
+    for (size_t i = 0; i < local_scopes_.size(); ++i) {
+      auto &p = places_[i];
+      // In-place all_reduce: the same buffer is both source and destination.
+      void *buffer = const_cast<void *>(lod_tensor_data.at(i));
+
+      int dev_id = boost::get<platform::CUDAPlace>(p).device;
+      auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+      auto stream = nccl_ctx.stream();
+      auto comm = nccl_ctx.comm_;
+      all_reduce_calls.emplace_back([=] {
+        PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+            buffer, buffer, numel, static_cast<ncclDataType_t>(nccl_dtype),
+            ncclSum, comm, stream));
+      });
+    }
+
+    this->RunAndRecordEvent([&] {
+      if (all_reduce_calls.size() == 1UL) {
+        // Do not use NCCLGroup when NCCL is managed per thread per device.
+        all_reduce_calls[0]();
+      } else {
+        platform::NCCLGroupGuard guard;
+        for (auto &call : all_reduce_calls) {
+          call();
+        }
+      }
+    });
+#else
+    PADDLE_THROW("Not compiled with CUDA");
+#endif
+  } else {
+    // Special handling for gradients of CPU-only operators, like CRF.
+    auto grad_name = grads_tensor.at(0).at(0).first;
+    auto &trg = *this->local_scopes_[0]
+                     ->FindVar(kLocalExecScopeName)
+                     ->Get<Scope *>()
+                     ->FindVar(grad_name)
+                     ->GetMutable<framework::LoDTensor>();
+
+    // Reduce All data to trg in CPU
+    ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
+    VisitDataType(trg.type(), func);
+
+    // Copy the reduced buffer from scope 0 to every other scope.
+    for (size_t i = 1; i < local_scopes_.size(); ++i) {
+      auto &scope =
+          *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
+      auto &p = places_[i];
+      auto *var = scope.FindVar(grad_name);
+      auto *dev_ctx = dev_ctxes_.at(p);
+      size_t size = numel * SizeOfType(trg.type());
+      RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] {
+        auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
+        platform::CPUPlace cpu_place;
+        memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
+      });
+    }
+  }
+}
+
+// Appends one (gradient name, tensor) pair per fused gradient to
+// `grad_tensor`. Each gradient is looked up in the scope stored under
+// kLocalExecScopeName inside local scope number `scope_idx`.
+void FusedAllReduceOpHandle::GetGradLoDTensor(
+    const size_t &scope_idx, const std::vector<VarHandle *> &in_var_handles,
+    const std::vector<VarHandle *> &out_var_handles,
+    std::vector<std::pair<std::string, const LoDTensor *>> *grad_tensor) const {
+  auto *exec_scope =
+      local_scopes_.at(scope_idx)->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  // The handles are grouped per gradient (grad0 on every place, then grad1,
+  // ...), so stepping by the number of places visits each gradient once.
+  const size_t stride = places_.size();
+
+  for (size_t idx = 0; idx < in_var_handles.size(); idx += stride) {
+    auto grad_name = in_var_handles[idx]->name();
+    // Input and output at the same position must be the same variable.
+    PADDLE_ENFORCE_EQ(grad_name, out_var_handles[idx]->name());
+    const auto &tensor = exec_scope->FindVar(grad_name)->Get<LoDTensor>();
+    // The tensor must live on the place that belongs to this scope.
+    PADDLE_ENFORCE_EQ(tensor.place(), places_.at(scope_idx));
+    grad_tensor->emplace_back(grad_name, &tensor);
+  }
+}
+
+// Writes to `*numel` the total element count of all tensors in `grad_tensor`
+// and to `*dtype` the type of the first tensor. Enforces that every tensor is
+// non-empty and that all tensors share that type. `*dtype` is left untouched
+// when `grad_tensor` is empty.
+void FusedAllReduceOpHandle::GetDTypeAndNumel(
+    const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor,
+    proto::VarType::Type *dtype, int64_t *numel) const {
+  *numel = 0;
+  bool is_first = true;
+  for (const auto &name_and_tensor : grad_tensor) {
+    const LoDTensor *tensor = name_and_tensor.second;
+
+    // Accumulate the element count.
+    int64_t tensor_numel = tensor->numel();
+    PADDLE_ENFORCE_GT(tensor_numel, 0);
+    *numel += tensor_numel;
+
+    // The first tensor decides the dtype; the rest must match it.
+    auto tensor_dtype = tensor->type();
+    if (is_first) {
+      *dtype = tensor_dtype;
+      is_first = false;
+    }
+    PADDLE_ENFORCE_EQ(tensor_dtype, *dtype);
+  }
+}
+
+// Returns the fixed name of this op handle, "fused_all_reduce".
+std::string FusedAllReduceOpHandle::Name() const {
+  return "fused_all_reduce";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/data_balance_op_handle.h
+++ b/paddle/fluid/framework/details/data_balance_op_handle.h
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
 #pragma once

 #include <string>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -27,31 +28,47 @@ namespace paddle {
 namespace framework {
 namespace details {

-struct DataBalanceOpHandle : public OpHandleBase {
- public:
+struct FusedAllReduceOpHandle : public OpHandleBase {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
+  FusedAllReduceOpHandle(ir::Node *node,
+                         const std::vector<Scope *> &local_scopes,
                         const std::vector<platform::Place> &places,
+                         const size_t num_of_all_reduce,
                         const platform::NCCLContextMap *ctxs);
 #else
-  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                      const std::vector<platform::Place> &places);
+  FusedAllReduceOpHandle(ir::Node *node,
+                         const std::vector<Scope *> &local_scopes,
+                         const std::vector<platform::Place> &places,
+                         const size_t num_of_all_reduce);
 #endif
-
  std::string Name() const override;

-  bool IsMultiDeviceTransfer() override { return false; };
+  // Delaying and buffering nccl_all_reduce ops together can significantly
+  // improve performance. Return false to disable this feature.
+  bool IsMultiDeviceTransfer() override { return true; };

 protected:
  void RunImpl() override;

 private:
-  // std::vector<(src_dev_id, dst_dev_id, trans_size)>
-  std::vector<std::array<int, 3>> GetBalancePlan(
-      const std::vector<int> &batch_size_per_device);
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+  size_t num_of_all_reduce_;
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  const platform::NCCLContextMap *nccl_ctxs_;
+#endif
+
+  // Sum the inputs' element counts and check that they share one dtype
+  void GetDTypeAndNumel(
+      const std::vector<std::pair<std::string, const LoDTensor *>> &g_tensor,
+      proto::VarType::Type *dtype, int64_t *total_num) const;

-  const std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> places_;
+  // Get gradient's name and LoDTensor
+  void GetGradLoDTensor(const size_t &scope_idx,
+                        const std::vector<VarHandle *> &in_var_handles,
+                        const std::vector<VarHandle *> &out_var_handles,
+                        std::vector<std::pair<std::string, const LoDTensor *>>
+                            *grad_tensor) const;
 };

 }  // namespace details

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -11,18 +11,19 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include <algorithm>
 #include <fstream>
+#include <memory>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
-
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/data_balance_op_handle.h"
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
@@ -134,21 +135,26 @@ void AddOutputToLeafOps(ir::Graph *graph) {
 }
 }  // namespace

+void MultiDevSSAGraphBuilderBase::CheckGraph(const ir::Graph &graph) const {}
+
 void MultiDevSSAGraphBuilderBase::Init() const {
  all_vars_.clear();

  loss_var_name_ = Get<const std::string>(kLossVarName);
+  VLOG(10) << "Init MultiDevSSAGraphBuilder, loss name: " << loss_var_name_;
  places_ = Get<const std::vector<platform::Place>>(kPlaces);
  local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
  strategy_ = Get<const BuildStrategy>(kStrategy);
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
+  nccl_ctxs_ = &Get<platform::NCCLContextMap>(kNCCLCtxs);
 #endif
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
 }

 std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  Init();
+  CheckGraph(*graph);
  std::vector<ir::Node *> sorted_ops = SortOperations(*graph);

  auto nodes = graph->ReleaseNodes();
@@ -166,7 +172,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
  result.Set(kGraphOps, new GraphOps);

  bool is_forwarding = true;
-  bool insert_collection_ops = NeedCollectiveOps();

  for (ir::Node *node : sorted_ops) {
    if (DealWithSpecialOp(&result, node)) {
@@ -185,8 +190,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
        CreateComputationalOps(&result, node, places_.size());
      }

-      // Insert collection ops
-      if (!is_forwarding && insert_collection_ops) {
+      // Insert collective ops if nranks > 1
+      if (!is_forwarding && Get<size_t>(kNRanks) > 1) {
        try {
          bool is_bk_op =
              static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
@@ -200,14 +205,14 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
              boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
                  OpProtoAndCheckerMaker::OpRoleVarAttrName()));
          PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
-
          for (size_t i = 0; i < backward_vars.size(); i += 2) {
            auto &p_name = backward_vars[i];
            auto &g_name = backward_vars[i + 1];
            VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
-
+            if (NeedCollectiveForGrad(g_name, sorted_ops)) {
              InsertCollectiveOp(&result, p_name, g_name);
            }
+          }
        } catch (boost::bad_get e) {
        }
      }
@@ -226,6 +231,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
   * Only variables should be the leaves of graph.
   */
  AddOutputToLeafOps(&result);
+
  result.Erase(kGraphOps);
  return graph;
 }
@@ -258,6 +264,11 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp(
  }
 }

+bool MultiDevSSAGraphBuilderBase::DealWithSpecialOp(ir::Graph *result,
+                                                    ir::Node *node) const {
+  return false;
+}
+
 std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations(
    const ir::Graph &graph) const {
  return ir::TopologySortOperations(graph);
@@ -271,8 +282,20 @@ bool MultiDevSSAGraphBuilderBase::UseGPU() const {
  return use_gpu;
 }

-bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const {
-  return Get<size_t>(kNRanks) > 1;
+bool MultiDevSSAGraphBuilderBase::NeedCollectiveForGrad(
+    const std::string &grad_name, std::vector<ir::Node *> ops) const {
+  // If the graph already has an allreduce op that takes this gradient as an
+  // input, there is no need to add an allreduce_op_handle for the gradient.
+  // NOTE: This is for the case where all gradients should get collective ops.
+  for (auto *node : ops) {
+    if (node->Op()->Type() != "allreduce") continue;
+    for (auto in_name : node->Op()->InputArgumentNames()) {
+      if (in_name == grad_name) {
+        return false;
+      }
+    }
+  }
+  return true;
 }

 void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result,
@@ -496,20 +519,17 @@ VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result,
 }

 bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const {
-  return boost::get<int>(
+  return !loss_var_name_.empty() && node->Op() &&
+         boost::get<int>(
             node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
             (static_cast<int>(OpRole::kBackward) |
-              static_cast<int>(OpRole::kLoss)) &&
-         !loss_var_name_.empty();  // If loss_var is empty. This is test mode
+              static_cast<int>(OpRole::kLoss));
 }

 bool MultiDevSSAGraphBuilderBase::IsSparseGradient(
    const std::string &og) const {
  PADDLE_ENFORCE(all_vars_.count(og) != 0);
-  if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
-    return true;
-  }
-  return false;
+  return all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS;
 }

 void AllReduceSSAGraphBuilder::InsertCollectiveOp(
@@ -995,7 +1015,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
 REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass,
                            paddle::framework::details::ReduceSSAGraphBuilder);
 REGISTER_MULTI_DEVICES_PASS(
-    allreduce_mode_multi_devices_pass,
+    all_reduce_mode_multi_devices_pass,
    paddle::framework::details::AllReduceSSAGraphBuilder);
 REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass,
                            paddle::framework::details::DistSSAGraphBuilder);
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -14,7 +14,10 @@

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>

@@ -31,12 +34,6 @@ namespace framework {
 class Scope;
 namespace details {

-constexpr char kLossVarName[] = "loss_var_name";
-constexpr char kPlaces[] = "places";
-constexpr char kLocalScopes[] = "local_scopes";
-constexpr char kStrategy[] = "strategy";
-constexpr char kNRanks[] = "nranks";
-
 class MultiDevSSAGraphBuilderBase : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
@@ -44,18 +41,21 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {

  virtual void Init() const;

+  virtual void CheckGraph(const ir::Graph &graph) const;
+
  virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;

  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                                  const std::string &g_name) const = 0;

-  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0;
+  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;

  virtual void InsertPostprocessOps(ir::Graph *result) const = 0;

  bool UseGPU() const;

-  bool NeedCollectiveOps() const;
+  bool NeedCollectiveForGrad(const std::string &grad_name,
+                             std::vector<ir::Node *> ops) const;

  bool IsScaleLossOp(ir::Node *node) const;

@@ -109,10 +109,6 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                                  const std::string &g_name) const;

-  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
-    return false;
-  }
-
  virtual void InsertPostprocessOps(ir::Graph *result) const {}
 };


--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -16,6 +16,9 @@

 #include <memory>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
 #include <vector>

 #include "paddle/fluid/framework/details/op_handle_base.h"
@@ -44,6 +47,26 @@ const char kGraphVars[] = "vars";
 typedef std::unordered_set<VarHandleBase *> GraphDepVars;
 const char kGraphDepVars[] = "dep_vars";

+constexpr char kNCCLCtxs[] = "nccl_ctxs";
+
+constexpr char kLossVarName[] = "loss_var_name";
+constexpr char kPlaces[] = "places";
+constexpr char kLocalScopes[] = "local_scopes";
+constexpr char kStrategy[] = "strategy";
+constexpr char kNRanks[] = "nranks";
+
+typedef std::unordered_set<std::string> FusedVars;
+constexpr char kFusedVars[] = "fused_vars";
+
+typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
+constexpr char kParamsAndGrads[] = "params_grads";
+
+typedef std::vector<std::vector<std::pair<std::string, std::string>>>
+    GroupGradsAndParams;
+constexpr char kGroupGradsAndParams[] = "group_grads_params";
+
+constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include <map>
+#include <unordered_set>

 namespace paddle {
 namespace framework {
@@ -41,15 +42,42 @@ OpHandleBase::~OpHandleBase() {

 void OpHandleBase::Run(bool use_cuda) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_cuda) {
+  if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) {
    for (auto &p : dev_ctxes_) {
      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
      PADDLE_ENFORCE(cudaSetDevice(dev_id));
      PADDLE_ENFORCE(
          cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
    }
+    if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
+      for (auto &out_var : outputs_) {
+        auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
+        if (out_var_handle) {
+          int dev_id =
+              boost::get<platform::CUDAPlace>(out_var_handle->place()).device;
+          out_var_handle->SetGenerateEvent(events_[dev_id]);
+        }
+      }
+    } else {
+      PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
+                        "%s should have only one dev_ctx.", Name());
+      auto &place = dev_ctxes_.begin()->first;
+      int dev_id = boost::get<platform::CUDAPlace>(place).device;
+      for (auto &out_var : outputs_) {
+        auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
+        if (out_var_handle) {
+          PADDLE_ENFORCE(
+              platform::is_same_place(place, out_var_handle->place()),
+              "The place of input(%s) is not consistent with the "
+              "place of current op(%s).",
+              out_var_handle->Name(), Name());
+          out_var_handle->SetGenerateEvent(events_[dev_id]);
+        }
+      }
+    }
  }
 #else
+
  PADDLE_ENFORCE(!use_cuda);
 #endif

@@ -93,17 +121,48 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
 void OpHandleBase::WaitInputVarGenerated() {
  for (auto in_var : inputs_) {
    if (NeedWait(in_var)) {
-      for (auto &pair : dev_ctxes_) {
-        in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second);
+      // A dummy variable only represents a dependency between operators,
+      // so no event is added for it.
+      auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
+      if (in_var_handle) {
+        auto &place = in_var_handle->place();
+        if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
+          auto stream =
+              static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
+                  ->stream();
+          PADDLE_ENFORCE(
+              cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
+          PADDLE_THROW("Doesn't compile the GPU.");
+#endif
+        }
+        // There is nothing to do when the place is CPUPlace.
      }
    }
  }
 }

 void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
-  for (auto *in : inputs_) {
-    if (NeedWait(in)) {
-      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place));
+  for (auto in_var : inputs_) {
+    if (NeedWait(in_var)) {
+      // A dummy variable only represents a dependency between operators,
+      // so no event is added for it.
+      auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
+      if (in_var_handle) {
+        if (platform::is_gpu_place(in_var_handle->place())) {
+#ifdef PADDLE_WITH_CUDA
+          auto stream = static_cast<platform::CUDADeviceContext *>(
+                            dev_ctxes_.at(in_var_handle->place()))
+                            ->stream();
+          PADDLE_ENFORCE(
+              cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
+          PADDLE_THROW("Doesn't compile the GPU.");
+#endif
+        }
+        // There is nothing to do when the place is CPUPlace.
+      }
    }
  }
 }

--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -53,6 +53,31 @@ struct ReduceLoDTensor {
  }
 };

+struct ReduceBufferData {
+  const std::vector<const void *> &src_data_;
+  void *dst_data_;
+  int64_t numel_;
+
+  ReduceBufferData(const std::vector<const void *> &src, void *dst,
+                   int64_t numel)
+      : src_data_(src), dst_data_(dst), numel_(numel) {}
+
+  template <typename T>
+  void apply() const {
+    T *dst_data = reinterpret_cast<T *>(dst_data_);
+    for (size_t i = 0; i < src_data_.size(); ++i) {
+      auto srd_data = reinterpret_cast<const T *>(src_data_[i]);
+      VLOG(10) << "dst: " << dst_data_ << ", " << srd_data;
+      if (srd_data == dst_data_) {
+        continue;
+      }
+
+      std::transform(srd_data, srd_data + numel_, dst_data, dst_data,
+                     [](T a, T b) -> T { return a + b; });
+    }
+  }
+};
+
 inline void GatherLocalSelectedRows(
    const std::vector<const SelectedRows *> &src_selecte_rows_,
    const std::vector<platform::Place> &in_places,

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -14,7 +14,6 @@

 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"

-#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -27,62 +26,49 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
    : graph_(graph),
      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                       : nullptr),
+      prepare_pool_(1),
      local_scopes_(local_scopes),
      places_(places),
      fetch_ctxs_(places),
-      running_ops_(0),
-      strategy_(strategy) {}
+      strategy_(strategy) {
+  PrepareOpDeps();
+  CopyOpDeps();
+}

 FeedFetchList ThreadedSSAGraphExecutor::Run(
    const std::vector<std::string> &fetch_tensors) {
  std::unique_ptr<platform::RecordEvent> event(
      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
-  std::unordered_map<OpHandleBase *, size_t> pending_ops;
-  std::unordered_set<VarHandleBase *> pending_vars;
-  auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
-  std::unordered_set<OpHandleBase *> ready_ops;
+  std::unique_ptr<OpDependentData> op_deps = op_deps_futures_.get();
+  CopyOpDeps();
+  VLOG(10) << "ThreadedSSAGraphExecutor::Run";
+  std::shared_ptr<BlockingQueue<VarHandleBase *>> ready_vars(
+      new BlockingQueue<VarHandleBase *>);
+  auto &pending_ops = op_deps->pending_ops_;
+  auto &pending_vars = op_deps->pending_vars_;
+  auto &ready_ops = op_deps->ready_ops_;
+
  // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
  // streams from multiple GPUs, it's faster to buffer them and schedule
  // together since we currently cannot overlap computation and memcpy streams.
  // Should revisit it if overlapping is available.
  std::unordered_set<OpHandleBase *> delayed_ops;

-  // Transform SSAGraph to pending_ops & pending_vars
-  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
-    for (auto &name_pair : var_map) {
-      for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(&pending_vars, ready_vars.get(), version_pair);
-      }
-    }
-  }
-  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
-    InsertPendingVar(&pending_vars, ready_vars.get(), var);
-  }
-
-  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
-    if (op->Inputs().empty()) {  // Special case, Op has no input.
-      ready_ops.insert(op);
-    } else {
-      InsertPendingOp(&pending_ops, op);
-    }
-  }
-
  // Step 2. Insert FetchOps
  std::vector<FetchOpHandle *> fetch_ops;
  std::unordered_set<VarHandleBase *> fetch_dependencies;
  FeedFetchList fetch_data(fetch_tensors.size());

-  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
-                 &pending_vars, ready_vars.get(), &fetch_data);
+  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &ready_ops,
+                 &pending_ops, &pending_vars, &fetch_data);

  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
    for (auto *op : set) {
-      running_ops_++;
      RunOp(ready_vars, op);
    }
    set.clear();
  };
-
+  auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); };
  // Clean run context
  run_op_futures_.clear();
  exception_holder_.Clear();
@@ -91,19 +77,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
  while (!pending_vars.empty()) {
    // 1. Run All Ready ops
    // Keep loop until all vars are ready.
-    //
-    // NOTE: DelayedOps have a lower priority. It will be scheduled after all
-    // ready_ops have been performed.
-    if (ready_ops.empty() && strategy_.allow_op_delay_ && running_ops_ == 0) {
-      run_all_ops(delayed_ops);
-    } else {
    run_all_ops(ready_ops);
-    }

    // 2. Find ready variable
    bool timeout;
    auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
-
    if (timeout) {
      if (exception_holder_.IsCaught()) {
        for (auto &run_op_future : run_op_futures_) {
@@ -115,6 +93,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
        continue;
      }
    }
+
    // 3. Remove the dependency of ready_var.
    // Find the ready_ops after the ready_var.
    for (auto ready_var : cur_ready_vars) {
@@ -123,11 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
        auto &deps = pending_ops[op];
        --deps;
        if (deps == 0) {
-          if (op->IsMultiDeviceTransfer() && strategy_.allow_op_delay_) {
-            delayed_ops.insert(op);
-          } else {
-            ready_ops.insert(op);
-          }
+          run_all_op(op);
        }
      }
    }
@@ -143,16 +118,17 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
    const std::vector<std::string> &fetch_tensors,
    std::vector<FetchOpHandle *> *fetch_ops,
    std::unordered_set<VarHandleBase *> *fetch_dependencies,
+    std::unordered_set<OpHandleBase *> *ready_ops,
    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
    std::unordered_set<VarHandleBase *> *pending_vars,
-    BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data) {
+    FeedFetchList *fetch_data) {
  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
-
+  std::unordered_set<VarHandleBase *> local_ready_vars;
  for (auto &fetch_var_name : fetch_tensors) {
    for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
      auto it = var_map.find(fetch_var_name);
      if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
+        fetched_vars[fetch_var_name].emplace_back(*it->second.rbegin());
      }
    }
  }
@@ -161,8 +137,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
    auto &var_name = fetch_tensors[i];
    auto fetched_var_it = fetched_vars.find(var_name);
    PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
-                   "Cannot find fetched variable.(Perhaps the main_program "
-                   "is not set to ParallelExecutor)");
+                   "Cannot find fetched variable(%s).(Perhaps the main_program "
+                   "is not set to ParallelExecutor)",
+                   var_name);

    auto &vars = fetched_var_it->second;

@@ -184,9 +161,23 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
    auto *fetch_dummy = new DummyVarHandle(fetch_var);
    op->AddOutput(fetch_dummy);
    fetch_dependencies->emplace(fetch_dummy);
-    this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy);
-    this->InsertPendingOp(pending_ops, op);
+
+    this->InsertPendingVar(pending_vars, &local_ready_vars, fetch_dummy);
+
+    size_t wait_input_num = 0;
+    std::unordered_set<VarHandleBase *> input_set(vars.begin(), vars.end());
+    for (auto *var : input_set) {
+      if (pending_vars->count(var)) {
+        ++wait_input_num;
+      }
+    }
+    if (wait_input_num) {
+      pending_ops->insert({op, wait_input_num});
+    } else {
+      ready_ops->insert(static_cast<OpHandleBase *>(op));
+    }
  }
+  PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
 }

 void ThreadedSSAGraphExecutor::InsertPendingOp(
@@ -197,13 +188,65 @@ void ThreadedSSAGraphExecutor::InsertPendingOp(

 void ThreadedSSAGraphExecutor::InsertPendingVar(
    std::unordered_set<VarHandleBase *> *pending_vars,
-    BlockingQueue<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
+    std::unordered_set<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
  pending_vars->insert(var);
  if (var->GeneratedOp() == nullptr) {
-    ready_vars->Push(var);
+    ready_vars->insert(var);
  }
 }

+void ThreadedSSAGraphExecutor::PrepareOpDeps() {
+  op_deps_.reset(new OpDependentData());
+  std::unordered_map<OpHandleBase *, size_t> &pending_ops =
+      op_deps_->pending_ops_;
+  std::unordered_set<VarHandleBase *> &pending_vars = op_deps_->pending_vars_;
+  std::unordered_set<OpHandleBase *> &ready_ops = op_deps_->ready_ops_;
+  std::unordered_set<VarHandleBase *> ready_vars;
+
+  // Transform SSAGraph to pending_ops & pending_vars
+  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
+    for (auto &name_pair : var_map) {
+      for (auto &version_pair : name_pair.second) {
+        InsertPendingVar(&pending_vars, &ready_vars, version_pair);
+      }
+    }
+  }
+  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
+    InsertPendingVar(&pending_vars, &ready_vars, var);
+  }
+
+  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
+    if (op->Inputs().empty()) {  // Special case, Op has no input.
+      ready_ops.insert(op);
+    } else {
+      InsertPendingOp(&pending_ops, op);
+    }
+  }
+  for (auto ready_var : ready_vars) {
+    pending_vars.erase(ready_var);
+    for (auto *op : ready_var->PendingOps()) {
+      auto &deps = pending_ops[op];
+      --deps;
+      if (deps == 0) {
+        ready_ops.insert(op);
+      }
+    }
+  }
+}
+
+void ThreadedSSAGraphExecutor::CopyOpDeps() {
+  op_deps_futures_ = prepare_pool_.enqueue([&] {
+    auto *op_deps = new OpDependentData();
+    op_deps->pending_ops_.insert(op_deps_->pending_ops_.begin(),
+                                 op_deps_->pending_ops_.end());
+    op_deps->pending_vars_.insert(op_deps_->pending_vars_.begin(),
+                                  op_deps_->pending_vars_.end());
+    op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(),
+                               op_deps_->ready_ops_.end());
+    return std::unique_ptr<OpDependentData>(op_deps);
+  });
+}
+
 void ThreadedSSAGraphExecutor::RunOp(
    const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
    details::OpHandleBase *op) {
@@ -216,7 +259,6 @@ void ThreadedSSAGraphExecutor::RunOp(
        op->Run(strategy_.use_cuda_);
      }
      VLOG(10) << op << " " << op->Name() << " Done ";
-      running_ops_--;
      ready_var_q->Extend(op->Outputs());
      VLOG(10) << op << " " << op->Name() << " Signal posted";
    } catch (...) {

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -15,18 +15,20 @@
 #pragma once

 #include <deque>
+#include <functional>
 #include <list>
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
-
-#include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
 #include "paddle/fluid/framework/ir/graph.h"

@@ -36,6 +38,12 @@ class Scope;

 namespace details {

+struct OpDependentData {
+  std::unordered_map<OpHandleBase *, size_t> pending_ops_;
+  std::unordered_set<VarHandleBase *> pending_vars_;
+  std::unordered_set<OpHandleBase *> ready_ops_;
+};
+
 class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
 public:
  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
@@ -57,29 +65,35 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
 private:
  ir::Graph *graph_;
  std::unique_ptr<::ThreadPool> pool_;
+  ::ThreadPool prepare_pool_;
  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;
  platform::DeviceContextPool fetch_ctxs_;
  ExceptionHolder exception_holder_;
-  std::atomic<int> running_ops_;

  void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
                       OpHandleBase *op_instance) const;

  void InsertPendingVar(std::unordered_set<VarHandleBase *> *pending_vars,
-                        BlockingQueue<VarHandleBase *> *ready_vars,
+                        std::unordered_set<VarHandleBase *> *ready_vars,
                        VarHandleBase *var) const;

  void InsertFetchOps(const std::vector<std::string> &fetch_tensors,
                      std::vector<FetchOpHandle *> *fetch_ops,
                      std::unordered_set<VarHandleBase *> *fetch_dependencies,
+                      std::unordered_set<OpHandleBase *> *ready_ops,
                      std::unordered_map<OpHandleBase *, size_t> *pending_ops,
                      std::unordered_set<VarHandleBase *> *pending_vars,
-                      BlockingQueue<VarHandleBase *> *ready_vars,
                      FeedFetchList *fetch_data);

+  void PrepareOpDeps();
+  void CopyOpDeps();
+
 private:
+  std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
+
  ExecutionStrategy strategy_;
+  std::unique_ptr<OpDependentData> op_deps_;
  // use std::list because clear(), push_back, and for_each are O(1)
  std::list<std::future<void>> run_op_futures_;
 };

--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -43,6 +43,7 @@ struct VarHandleBase {
  virtual ~VarHandleBase();

  virtual std::string DebugString() const = 0;
+  virtual const std::string& Name() const = 0;

  void AddInput(OpHandleBase* in, ir::Node* node) {
    node_->inputs.clear();
@@ -95,8 +96,6 @@ struct VarHandleBase {
 //
 // NOTE: runtime variables have place.
 struct VarHandle : public VarHandleBase {
-  explicit VarHandle(ir::Node* node) : VarHandleBase(node) {}
-
  virtual ~VarHandle();

  std::string DebugString() const override;
@@ -109,6 +108,20 @@ struct VarHandle : public VarHandleBase {
        name_(std::move(name)),
        place_(std::move(place)) {}

+#ifdef PADDLE_WITH_CUDA
+  bool HasEvent() { return has_event_; }
+
+  const cudaEvent_t& GetEvent() {
+    PADDLE_ENFORCE(HasEvent(), "The event is not set.");
+    return event_;
+  }
+
+  void SetGenerateEvent(const cudaEvent_t& event) {
+    has_event_ = true;
+    event_ = event;
+  }
+#endif
+
  // version field currently is not used, however, just store the version to
  // debug easily.
 private:
@@ -116,6 +129,11 @@ struct VarHandle : public VarHandleBase {
  size_t scope_idx_;
  std::string name_;
  platform::Place place_;
+#ifdef PADDLE_WITH_CUDA
+  // Only when this event is triggered, var is generated.
+  cudaEvent_t event_;
+  bool has_event_{false};
+#endif

 public:
  bool IsTheSameVar(const VarHandle& o) const {
@@ -125,6 +143,7 @@ struct VarHandle : public VarHandleBase {

  size_t version() const { return version_; }
  size_t scope_idx() const { return scope_idx_; }
+  const std::string& Name() const override { return name_; }
  const std::string& name() const { return name_; }
  const platform::Place& place() const { return place_; }
 };
@@ -136,6 +155,10 @@ struct DummyVarHandle : public VarHandleBase {
  virtual ~DummyVarHandle();

  std::string DebugString() const override;
+
+ public:
+  const std::string& Name() const override { return name_; }
+  std::string name_{"DummyVar"};
 };

 }  // namespace details

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -70,6 +70,7 @@ pass_library(conv_affine_channel_fuse_pass inference)
 pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)
+pass_library(runtime_context_cache_pass base)

 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will

--- a/paddle/fluid/framework/ir/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.cc
@@ -224,8 +224,8 @@ std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl(

  PADDLE_ENFORCE(param_scope());

+  QuantizeConv(graph.get(), false /* with_residual_data */);
  QuantizeConv(graph.get(), true /* with_residual_data */);
-  QuantizeConv(graph.get());
  QuantizePool(graph.get());

  return graph;

--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -599,10 +599,19 @@ bool VarLinksToOp(Node *node, const std::string &op_type) {
 bool IsNthInput(Node *var, Node *op, const std::string &argument, size_t nth) {
  PADDLE_ENFORCE(var->IsVar());
  PADDLE_ENFORCE(op->IsOp());
-  if (op->Op()->Input(argument).size() <= nth) return false;
+  if (!HasInput(op, argument) || op->Op()->Input(argument).size() <= nth)
+    return false;
  return var->Name() == op->Op()->Input(argument)[nth];
 }

+// Returns true when `argument` is one of the input names reported by the op
+// node's OpDesc. Enforces that `op` is an op node.
+bool HasInput(Node *op, const std::string &argument) {
+  PADDLE_ENFORCE(op->IsOp());
+  const auto &input_names = op->Op()->InputNames();
+  return std::find(input_names.begin(), input_names.end(), argument) !=
+         input_names.end();
+}
+
 bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) {
  PADDLE_ENFORCE(var->IsVar());
  PADDLE_ENFORCE(op->IsOp());
@@ -1082,8 +1091,15 @@ PDNode *patterns::Conv::operator()() {
 PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");

-  if (!with_residual_data)
-    conv_op->assert_op_attr("fuse_residual_connection", false);
+  if (!with_residual_data) {
+    conv_op->assert_more([&](Node *x) {
+      auto node_names = x->Op()->InputNames();
+      if (!HasInput(x, "ResidualData") ||
+          x->Op()->Input("ResidualData").size() == 0)
+        return true;
+      return false;
+    });
+  }

  auto input_var = pattern->NewNode(conv_input_repr())
                       ->AsInput()

--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -305,6 +305,9 @@ bool VarLinksFromOp(Node* node, const std::string& op_type);
 // Check whether a var node is a op node's nth input.
 bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth);

+// Check whether the op node has input of given name.
+bool HasInput(Node* op, const std::string& argument);
+
 // Tell whether a var node is a op node's nth output.
 bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);


--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
@@ -14,12 +14,16 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include "paddle/fluid/framework/ir/pass.h"

 namespace paddle {
 namespace framework {
 namespace ir {

+/*
+ * Specifies which operators should use MKLDNN.
+ */
 class MKLDNNPlacementPass : public Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(

--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/runtime_context_cache_pass.h"
+#include <memory>
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Sets the kEnableCacheRuntimeContext attribute to true on every op node of
+// the graph and hands the same graph back to the caller.
+std::unique_ptr<ir::Graph> RuntimeContextCachePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Applies Runtime Context Cache strategy.";
+  for (const Node* node : graph->Nodes()) {
+    // Only op nodes are tagged; every other node is skipped.
+    if (!node->IsOp()) continue;
+    node->Op()->SetAttr(kEnableCacheRuntimeContext, true);
+  }
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(runtime_context_cache_pass,
+              paddle::framework::ir::RuntimeContextCachePass);
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.h
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Graph pass whose ApplyImpl (defined in runtime_context_cache_pass.cc) sets
+// the kEnableCacheRuntimeContext attribute on the op nodes of the graph.
+class RuntimeContextCachePass : public Pass {
+ protected:
+  // Applies the pass to `graph` and returns the resulting graph.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -876,7 +876,22 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(

+// Runs the operator with either a fresh or a cached RuntimeContext.
+// Without the kEnableCacheRuntimeContext attribute a new RuntimeContext is
+// built on every call. With it, the context held in runtime_ctx_ is reused and
+// rebuilt only when none exists yet or when the address of `scope` differs
+// from the one recorded in pre_scope_.
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
+  if (!HasAttr(kEnableCacheRuntimeContext)) {
    RuntimeContext ctx(Inputs(), Outputs(), scope);
+    RunImpl(scope, place, &ctx);
+  } else {
+    // NOTE(review): the cache is keyed on the Scope address only and is
+    // updated from a const method with no locking visible here -- confirm
+    // that the same op instance is never run concurrently and that a scope
+    // address is not reused for a different scope.
+    const Scope* cur_scope = &scope;
+    if (!runtime_ctx_ || pre_scope_ != cur_scope) {
+      runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
+      pre_scope_ = cur_scope;
+    }
+    RunImpl(scope, place, runtime_ctx_.get());
+  }
+}
+
+void OperatorWithKernel::RunImpl(const Scope& scope,
+                                 const platform::Place& place,
+                                 RuntimeContext* runtime_ctx) const {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(place);

@@ -891,7 +906,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  OpKernelMap& kernels = kernels_iter->second;

  auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
+      ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr));
  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;

  auto kernel_iter = kernels.find(expected_kernel_key);
@@ -915,8 +930,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,

  // do data transformScope &transfer_scope;
  std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
-      PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx);
+  auto* transfer_scope = PrepareData(scope, expected_kernel_key,
+                                     &transfered_inplace_vars, runtime_ctx);

  // exec scope is the scope that kernel actually executed on.
  const Scope& exec_scope =
@@ -927,13 +942,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  }

  if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
-    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
+    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
    this->InferShape(&infer_shape_ctx);
  }
  // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
  // not Scope. Imperative mode only pass inputs and get outputs.
-  kernel_iter->second(
-      ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs));
+  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,
+                                       *runtime_ctx, kernel_configs));

  if (!transfered_inplace_vars.empty()) {
    // there is inplace variable has been transfered.

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -62,6 +62,14 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
 /// Variables with this suffix are the new Gradient.
 constexpr char kNewGradSuffix[] = "@NEWGRAD@";

+/// RuntimeContext is used to relate input/output names of Operator with
+/// the corresponding variables in name scope.
+/// If an Op has the attribute kEnableCacheRuntimeContext, then within one
+/// name scope, because the input/output names of this Op do not change during
+/// execution, its RuntimeContext can be created only at the first iteration
+/// of this Op's execution, which saves time on later iterations.
+constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";
+
 /// If an Op has this attribute, all its kernels should calculate output
 /// variable's shape in the corresponding Compute() function. And
 /// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
@@ -456,6 +464,8 @@ class OperatorWithKernel : public OperatorBase {
  // same.
  proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
  void RunImpl(const Scope& scope, const platform::Place& place) const final;
+  void RunImpl(const Scope& scope, const platform::Place& place,
+               RuntimeContext* runtime_ctx) const;

  /**
   * Transfer data from scope to a transfered scope. If there is no data need to
@@ -474,6 +484,8 @@ class OperatorWithKernel : public OperatorBase {

 protected:
  mutable OpKernelConfigsMap kernel_configs_map_;
+  mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
+  mutable const Scope* pre_scope_ = nullptr;
 };

 extern bool OpSupportGPU(const std::string& op_type);

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -254,18 +254,29 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
        member_->places_, nccl_id, build_strategy.num_trainers_,
        build_strategy.trainer_id_));

+    // Initialize device context's nccl comm, will be used by normal
+    // Operators like sync_batch_norm, and collective ops.
+    // NOTE: if more than one ParallelExecutor uses the same place, the nccl
+    // comm will be overwritten, which may cause problems.
+    // NOTE: NCCL group-calls and non-group-calls can not use the same
+    // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use
+    // same communicators.
    std::unique_ptr<platform::NCCLContextMap> dev_nccl_ctxs;
+    if (nccl_id == nullptr) {
      dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_));
-    // Initialize device context's nccl comm
-    // Note, more than one ParallelExecutor with same place, the nccl comm will
-    // be rewrite and there will be some problem.
+    }
    for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
-      auto &nccl_ctx = dev_nccl_ctxs->at(dev_id);
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
          pool.Get(member_->places_[dev_id]));
+      if (nccl_id != nullptr) {
+        auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[dev_id]);
        dev_ctx->set_nccl_comm(nccl_ctx.comm());
+      } else {
+        auto &nccl_ctx = dev_nccl_ctxs->at(member_->places_[dev_id]);
+        dev_ctx->set_nccl_comm(nccl_ctx.comm());
+      }
    }
 #else
    PADDLE_THROW("Not compiled with CUDA");

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -34,7 +34,7 @@ DEFINE_double(
    "Memory size threshold (GB) when the garbage collector clear tensors."
    "Disabled when this value is less than 0");

-DEFINE_bool(fast_eager_deletion_mode, false,
+DEFINE_bool(fast_eager_deletion_mode, true,
            "Fast eager deletion mode. If enabled, memory would release "
            "immediately without waiting GPU kernel ends.");


--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -131,6 +131,15 @@ struct Argument {
  // Pass a set of op types to enable its mkldnn kernel
  DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                      std::unordered_set<std::string>);
+
+  // A set of op types to enable their quantized kernels
+  DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes,
+                      std::unordered_set<std::string>);
+
+  // A set of op IDs to exclude from enabling their quantized kernels
+  DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, QuantizeExcludedOpIds,
+                      std::unordered_set<int>);
+
  // Scales for variables to be quantized
  DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);


--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
@@ -60,6 +61,13 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("mkldnn_enabled_op_types",
                new std::unordered_set<std::string>(
                    argument->mkldnn_enabled_op_types()));
+    } else if (pass_name == "cpu_quantize_placement_pass") {
+      pass->Set("quantize_enabled_op_types",
+                new std::unordered_set<std::string>(
+                    argument->quantize_enabled_op_types()));
+      pass->Set(
+          "quantize_excluded_op_ids",
+          new std::unordered_set<int>(argument->quantize_excluded_op_ids()));
    } else if (pass_name == "cpu_quantize_pass") {
      pass->Set("quant_var_scales",
                new VarQuantScale(argument->quant_var_scales()));

--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -202,6 +202,7 @@ void AnalysisConfig::Update() {
      // Append after the Affine_channel_conv_fuse pass.
      pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
    }
+    pass_builder()->DeletePass("runtime_context_cache_pass");
  }

  if (use_mkldnn_) {

--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -80,6 +80,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
        "conv_elementwise_add_act_fuse_pass",   //
        "conv_elementwise_add2_act_fuse_pass",  //
        "conv_elementwise_add_fuse_pass",       //
+        "runtime_context_cache_pass",           //
 #endif
  });

@@ -90,6 +91,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
  use_gpu_ = true;
 }

+// GPU override of EnableQuantizer: it only logs an error and does not modify
+// the pass list.
+void GpuPassStrategy::EnableQuantizer() {
+  LOG(ERROR) << "GPU not support quantization yet";
+}
+
 void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
  analysis_passes_.push_back(pass);
 }
@@ -115,6 +120,7 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
      "conv_eltwiseadd_bn_fuse_pass",  //
      "is_test_pass",                  //
      "identity_scale_op_clean_pass",  //
+      "runtime_context_cache_pass",    //
  });
  use_gpu_ = false;
 }

--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -84,6 +84,10 @@ class PassStrategy : public PaddlePassBuilder {
   */
  virtual void EnableMKLDNN() {}

+  /** Enable quantize optimization
+   */
+  virtual void EnableQuantizer() {}
+
  bool use_gpu() const { return use_gpu_; }

  virtual ~PassStrategy() = default;
@@ -124,6 +128,16 @@ class CpuPassStrategy : public PassStrategy {
    use_mkldnn_ = false;
 #endif
  }
+
+  // Appends the CPU quantize placement pass the first time quantization is
+  // enabled; later calls leave the pass list unchanged.
+  void EnableQuantizer() override {
+    if (use_quantizer_) return;
+    passes_.push_back("cpu_quantize_placement_pass");
+    use_quantizer_ = true;
+  }
+
+ protected:
+  bool use_quantizer_{false};
 };

 /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
@@ -138,6 +152,7 @@ class GpuPassStrategy : public PassStrategy {
  }

  void EnableMKLDNN() override;
+  void EnableQuantizer() override;

  virtual ~GpuPassStrategy() = default;
 };

--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
 cc_library(benchmark SRCS benchmark.cc DEPS enforce)
 cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
-cc_binary(visualizer SRCS visualizer.cc DEPS analysis
-    paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
--- a/paddle/fluid/inference/utils/visualizer.cc
+++ b/paddle/fluid/inference/utils/visualizer.cc
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/utils/visualizer.h"
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <fstream>
-#include <memory>
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
-#include "paddle/fluid/platform/init.h"
-
-DEFINE_string(model_dir, "", "model directory");
-DEFINE_string(model_program_path, "", "model program path");
-DEFINE_string(model_params_path, "", "model params path");
-
-using paddle::inference::analysis::Argument;
-
-namespace paddle {
-namespace inference {
-namespace utils {
-
-void Visualizer::SetArgument(Argument *argument) { argument_ = argument; }
-
-bool Visualizer::Run() {
-  paddle::framework::InitDevices(false);
-  paddle::inference::analysis::Analyzer().Run(argument_);
-  return true;
-}
-
-}  // namespace utils
-}  // namespace inference
-}  // namespace paddle
-
-// Generate a dot file describing the structure of graph.
-// To use this tool, run command: ./visualizer [options...]
-// Options:
-//     --model_dir: the directory of model
-//     --model_program_path: the path of program
-//     --model_params_path: the path of params
-int main(int argc, char *argv[]) {
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  google::InitGoogleLogging(argv[0]);
-
-  paddle::inference::analysis::Argument argument;
-  argument.SetUseGPU(false);
-  argument.SetUseTensorRT(false);
-
-  if (FLAGS_model_dir.empty()) {
-    if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) {
-      LOG(ERROR) << "Please set model_dir"
-                    " or model_program_path and model_params_path";
-      return -1;
-    } else {
-      argument.SetModelProgramPath(FLAGS_model_program_path);
-      argument.SetModelParamsPath(FLAGS_model_params_path);
-    }
-  } else {
-    argument.SetModelDir(FLAGS_model_dir);
-  }
-
-  // Only 1 pass, default filename is 0_ir_origin.dot
-  // For more details, looking for paddle::inference::analysis::IRPassManager
-  argument.SetIrAnalysisPasses({"infer_clean_graph_pass", "graph_viz_pass"});
-
-  std::unique_ptr<paddle::framework::Scope> scope{
-      new paddle::framework::Scope()};
-  argument.SetScopeNotOwned(
-      const_cast<paddle::framework::Scope *>(scope.get()));
-
-  paddle::inference::utils::Visualizer visualizer;
-  visualizer.SetArgument(&argument);
-  visualizer.Run();
-
-  return 0;
-}
-
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(graph_to_program_pass);
--- a/paddle/fluid/inference/utils/visualizer.h
+++ b/paddle/fluid/inference/utils/visualizer.h
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/inference/analysis/argument.h"
-
-namespace paddle {
-namespace inference {
-namespace utils {
-
-using paddle::inference::analysis::Argument;
-
-class Visualizer final {
- public:
-  Visualizer() = default;
-  ~Visualizer() = default;
-  Visualizer(const Visualizer &) = delete;
-  Visualizer &operator=(const Visualizer &) = delete;
-
-  void SetArgument(Argument *);
-  bool Run();
-
- private:
-  Argument *argument_;
-};
-
-}  // namespace utils
-}  // namespace inference
-}  // namespace paddle
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -76,12 +76,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
                                      const std::string& name) {
  framework::LibraryType library{framework::LibraryType::kPlain};
  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-#ifdef PADDLE_WITH_CUDA
-  auto it1 = oper.Attrs().find("use_cudnn");
-  if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) {
-    library = framework::LibraryType::kCUDNN;
-  }
-#endif
+// FIXME(liuwei1031) temporarily disable the code to unblock users
+// TODO(liuwei1031) figure out the reason behind
+// https://github.com/PaddlePaddle/Paddle/issues/16096
+// and re-enable this in the future
+// #ifdef PADDLE_WITH_CUDA
+//   auto it1 = oper.Attrs().find("use_cudnn");
+//   if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) {
+//     library = framework::LibraryType::kCUDNN;
+//   }
+// #endif
 #ifdef PADDLE_WITH_MKLDNN
  auto it = oper.Attrs().find("use_mkldnn");
  if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
@@ -188,6 +192,9 @@ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 UNUSED constexpr char SqrtDoc[] = R"DOC(
 Sqrt Activation Operator.

+Please make sure legal input, when input a negative value closed to zero,
+you should add a small epsilon(1e-12) to avoid negative number caused by numerical errors.
+
 $out = \sqrt{x}$

 )DOC";

--- a/paddle/fluid/operators/affine_channel_op.cc
+++ b/paddle/fluid/operators/affine_channel_op.cc
@@ -67,6 +67,22 @@ class AffineChannelOp : public framework::OperatorWithKernel {
                   "Input(Bias) of AffineChannelOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of AffineChannelOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto scale_dims = ctx->GetInputDim("Scale");
+    auto b_dims = ctx->GetInputDim("Bias");
+    const framework::DataLayout data_layout = framework::StringToDataLayout(
+        ctx->Attrs().Get<std::string>("data_layout"));
+
+    const int64_t C = (data_layout == framework::DataLayout::kNCHW
+                           ? x_dims[1]
+                           : x_dims[x_dims.size() - 1]);
+
+    PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL);
+    PADDLE_ENFORCE_EQ(scale_dims[0], C);
+    PADDLE_ENFORCE_EQ(b_dims.size(), 1UL);
+    PADDLE_ENFORCE_EQ(b_dims[0], C);
+
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
    ctx->ShareLoD("X", "Out");
  }
@@ -97,6 +113,27 @@ class AffineChannelOpGrad : public framework::OperatorWithKernel {
  }
 };

+// Builds the OpDesc of affine_channel_grad from the forward affine_channel op.
+// The grad op reads X, Scale and the gradient of Out, and writes the gradients
+// of X, Scale and Bias; the forward attributes are copied unchanged.
+class AffineChannelGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    // Own the OpDesc from the moment it is allocated so that it is released
+    // if any of the setters below throws.
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("affine_channel_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("Scale", Input("Scale"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+
+    return op;
+  }
+};
+
 template <typename T>
 using EigenArrayMap =
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
@@ -244,8 +281,7 @@ namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;

 REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp,
-                  ops::AffineChannelOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::AffineChannelOpMaker, ops::AffineChannelGradMaker);
 REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad);

 REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel<CPU, float>,

--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -57,7 +57,7 @@ class ConcatOp : public framework::OperatorWithKernel {
                              "elements except the specify axis.");
          } else {
            // not check -1 with other in compile time
-            if (out_dims[j] != -1 && ins[i][j] != -1) {
+            if (out_dims[j] > 0 && ins[i][j] > 0) {
              PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
                                "Input tensors should have the same "
                                "elements except the specify axis.");

--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/conv_transpose_op.h"
+#include <memory>
 #include <string>
 #include <vector>

@@ -344,6 +345,28 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
                                 ctx.GetPlace(), layout_, library_);
 }

+// Grad-op maker registered for the conv transpose operators in this file. The
+// grad op type is the forward op type with "_grad" appended, and Bias (with
+// its gradient) is wired in only when the forward op lists a Bias input.
+class ConvTransposeGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> grad_op(new framework::OpDesc());
+    grad_op->SetType(ForwardOp().Type() + "_grad");
+    grad_op->SetAttrMap(Attrs());
+
+    // Inputs of the grad op.
+    grad_op->SetInput("Input", Input("Input"));
+    grad_op->SetInput("Filter", Input("Filter"));
+    grad_op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
+
+    // Outputs of the grad op.
+    grad_op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    grad_op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
+
+    // Bias is optional: forward it only when the forward op declares it.
+    const bool forward_has_bias = ForwardOp().Inputs().count("Bias") > 0;
+    if (forward_has_bias) {
+      grad_op->SetInput("Bias", Input("Bias"));
+      grad_op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    }
+    return grad_op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

@@ -352,7 +375,7 @@ namespace ops = paddle::operators;
 // conv2d_transpose
 REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
                  ops::Conv2DTransposeOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ConvTransposeGradOpDescMaker);
 REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad);

 REGISTER_OP_CPU_KERNEL(
@@ -368,7 +391,7 @@ REGISTER_OP_CPU_KERNEL(
 // conv3d_transpose
 REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
                  ops::Conv3DTransposeOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ConvTransposeGradOpDescMaker);
 REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad);

 REGISTER_OP_CPU_KERNEL(
@@ -384,7 +407,7 @@ REGISTER_OP_CPU_KERNEL(
 // depthwise conv2d_transpose
 REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp,
                  ops::Conv2DTransposeOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ConvTransposeGradOpDescMaker);
 REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad);

 REGISTER_OP_CPU_KERNEL(

--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -74,6 +74,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
              "Norm of the second input, reduced along the 1st "
              "dimension.")
        .AsIntermediate();
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                  "Skip calling InferShape() function in the runtime.")
+        .SetDefault(true);

    AddComment(R"DOC(
 **Cosine Similarity Operator**

--- a/paddle/fluid/operators/cos_sim_op.h
+++ b/paddle/fluid/operators/cos_sim_op.h
@@ -28,17 +28,21 @@ class CosSimKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    // get Tensor
-    auto* in_x = context.Input<Tensor>("X");
+    auto* in_x = context.Input<framework::LoDTensor>("X");
    auto* in_y = context.Input<Tensor>("Y");
-    auto* out_z = context.Output<Tensor>("Out");
+    auto* out_z = context.Output<framework::LoDTensor>("Out");
    auto* out_x_norm = context.Output<Tensor>("XNorm");
    auto* out_y_norm = context.Output<Tensor>("YNorm");
-    out_z->mutable_data<T>(context.GetPlace());
-    out_x_norm->mutable_data<T>(context.GetPlace());
-    out_y_norm->mutable_data<T>(context.GetPlace());

    int rows_x = in_x->dims()[0];
    int rows_y = in_y->dims()[0];
+    out_z->Resize({rows_x, 1});
+    out_x_norm->Resize({rows_x, 1});
+    out_y_norm->Resize({rows_y, 1});
+    out_z->mutable_data<T>(context.GetPlace());
+    out_x_norm->mutable_data<T>(context.GetPlace());
+    out_y_norm->mutable_data<T>(context.GetPlace());
+    out_z->set_lod(in_x->lod());

    int cols = framework::product(in_x->dims()) / rows_x;

@@ -81,6 +85,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {

    if (rows_x == rows_y) {
      if (out_grad_x) {
+        out_grad_x->Resize(in_x->dims());
        math::CosSimGradFunctor<T> functor(
            in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
            in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
@@ -91,6 +96,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
        for_range(functor);
      }
      if (out_grad_y) {
+        out_grad_y->Resize(in_y->dims());
        math::CosSimGradFunctor<T> functor(
            in_y_norm->data<T>(), in_x_norm->data<T>(), in_y->data<T>(),
            in_x->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
@@ -102,6 +108,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
      }
    } else {
      if (out_grad_x) {
+        out_grad_x->Resize(in_x->dims());
        math::CosSimDxFunctor<T> functor(
            in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
            in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
@@ -112,6 +119,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
        for_range(functor);
      }
      if (out_grad_y) {
+        out_grad_y->Resize(in_y->dims());
        out_grad_y->mutable_data<T>(context.GetPlace());
        math::SetConstant<DeviceContext, T> set_zero;
        auto& dev_ctx = context.template device_context<DeviceContext>();

--- a/paddle/fluid/operators/distributed_ops/allreduce_op.cc
+++ b/paddle/fluid/operators/distributed_ops/allreduce_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+// Visitor handed to framework::VisitDataType in this file: apply<T>() calls
+// tensor_->mutable_data<T>(place_) and stores the returned pointer in *data_,
+// so the caller gets an untyped buffer pointer for a runtime-selected T.
+struct MutableDataFunctor {
+  MutableDataFunctor(void** data, framework::LoDTensor* tensor,
+                     const platform::Place& place)
+      : data_(data), tensor_(tensor), place_(place) {}
+
+  // T is supplied by the dispatcher that invokes this functor.
+  template <typename T>
+  void apply() {
+    *data_ = tensor_->mutable_data<T>(place_);
+  }
+
+  void** data_;                   // out-parameter receiving the pointer
+  framework::LoDTensor* tensor_;  // tensor whose buffer is requested
+  platform::Place place_;         // place passed to mutable_data
+};
+
+// Operator that all-reduces one LoDTensor by calling NCCL's ncclAllReduce.
+//
+// Slot "X" names the tensor to reduce; slot "Out" names the result, which is
+// resized to X's dims. The "reduce_type" attribute selects the reduction:
+// 0 sum, 1 prod, 2 max, 3 min; any other value raises. The op only accepts a
+// GPU place, and the NCCL call is compiled in only under PADDLE_WITH_CUDA.
+class AllReduceOp : public framework::OperatorBase {
+  using OperatorBase::OperatorBase;
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    PADDLE_ENFORCE(is_gpu_place(place),
+                   "AllReduce op can run on gpu place only for now.");
+#ifdef PADDLE_WITH_CUDA
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* ctx = pool.Get(place);
+    auto in_names = Inputs("X");
+    auto out_names = Outputs("Out");
+    PADDLE_ENFORCE_EQ(in_names.size(), 1, "Only support one input");
+    PADDLE_ENFORCE_EQ(out_names.size(), 1, "Only support one output");
+
+    auto* in = scope.FindVar(in_names[0]);
+    auto* out = scope.FindVar(out_names[0]);
+    // A name missing from the scope must fail with a message instead of
+    // being dereferenced below.
+    PADDLE_ENFORCE(in != nullptr, "Input variable %s not found in scope.",
+                   in_names[0]);
+    PADDLE_ENFORCE(out != nullptr, "Output variable %s not found in scope.",
+                   out_names[0]);
+
+    // NOTE(review): this passes when either variable holds a LoDTensor; the
+    // condition is kept unchanged because "Out" may not hold a tensor until
+    // GetMutable below runs -- confirm whether `&&` was intended.
+    PADDLE_ENFORCE(in->IsType<framework::LoDTensor>() ||
+                       out->IsType<framework::LoDTensor>(),
+                   "Only support allreduce LoDTensors");
+
+    // Bind by reference: the input is only read, so no tensor copy is needed.
+    const auto& in_tensor = in->Get<framework::LoDTensor>();
+    ncclDataType_t dtype = platform::ToNCCLDataType(in_tensor.type());
+
+    int64_t numel = in_tensor.numel();
+    auto* sendbuff = in_tensor.data<void>();
+    auto* out_tensor = out->GetMutable<framework::LoDTensor>();
+    out_tensor->Resize(in_tensor.dims());
+    // Obtain the receive buffer with the same element type as the input.
+    void* recvbuff = nullptr;
+    framework::VisitDataType(in_tensor.type(),
+                             MutableDataFunctor(&recvbuff, out_tensor, place));
+
+    auto cuda_ctx = static_cast<platform::CUDADeviceContext*>(ctx);
+    auto* comm = cuda_ctx->nccl_comm();
+    // FIXME(typhoonzero): should use nccl stream here.
+    auto stream = cuda_ctx->stream();
+
+    int reduce_type = Attr<int>("reduce_type");
+    ncclRedOp_t red_type = ncclSum;
+    switch (reduce_type) {
+      case 0:
+        red_type = ncclSum;
+        break;
+      case 1:
+        red_type = ncclProd;
+        break;
+      case 2:
+        red_type = ncclMax;
+        break;
+      case 3:
+        red_type = ncclMin;
+        break;
+      default:
+        // An unknown value used to fall back to ncclSum silently.
+        PADDLE_THROW("Invalid reduce type: %d", reduce_type);
+    }
+
+    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+        sendbuff, recvbuff, numel, dtype, red_type, comm, stream));
+#endif
+  }
+};
+
+// Declares the allreduce op's proto: input "X", output "Out" and the integer
+// attribute "reduce_type" (default 0), whose values are listed in the op
+// comment below.
+class AllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  // `override` lets the compiler verify this matches the base-class hook.
+  void Make() override {
+    AddInput("X", "(Tensor), tensor to be allreduced.");
+    AddOutput("Out", "(Tensor) the result of allreduced.");
+    AddAttr<int>("reduce_type", "(int) determin the reduce type.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+***AllReduce Operator***
+
+Call NCCL AllReduce internally. Note that this op must be used when one
+thread is managing one GPU device.
+
+For speed reasons, reduce_type should be an integer:
+
+0: sum
+1: prod
+2: max
+3: min
+
+If input and output are the same variable, in-place allreduce will be used.
+)DOC");
+  }
+};
+
+// Shape inference for allreduce. The call operator is deliberately empty;
+// AllReduceOp::RunImpl resizes "Out" to the input's dims itself.
+class AllReduceOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+// Register "allreduce" with AllReduceOp as the operator, EmptyGradOpMaker as
+// the gradient maker, AllReduceOpMaker as the proto maker and the no-op
+// AllReduceOpShapeInference. No kernels are registered in this file.
+REGISTER_OPERATOR(allreduce, ops::AllReduceOp,
+                  paddle::framework::EmptyGradOpMaker, ops::AllReduceOpMaker,
+                  ops::AllReduceOpShapeInference);
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/dropout_op.h"
+#include <memory>
 #include <string>

 namespace paddle {
@@ -70,7 +71,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
        "1. downgrade_in_infer(default), downgrade the outcome at inference "
        "time"
        "   train: out = input * mask"
-        "   inference: out = input * dropout_prob"
+        "   inference: out = input * (1.0 - dropout_prob)"
        "2. upscale_in_train, upscale the outcome at training time, do nothing "
        "in inference"
        "   train: out = input * mask / ( 1.0 - dropout_prob )"
@@ -106,21 +107,31 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
                      "GradOp is only callable when is_test is false");

-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) must not be null.");

-    auto x_dims = ctx->GetInputDim("X");
    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_EQ(x_dims, out_dims,
-                      "Dimensions of Input(X) and Out@Grad must be the same.");
-    auto mask_dims = ctx->GetInputDim("Mask");
-    PADDLE_ENFORCE_EQ(x_dims, mask_dims,
-                      "Dimensions of Input(X) and Mask must be the same.");
-
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
+
+    ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
+    ctx->ShareLoD(framework::GradVarName("Out"),
+                  /*->*/ framework::GradVarName("X"));
+  }
+};
+
+class DropoutGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:  // Builds the dropout_grad OpDesc; forward input X is not wired in.
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("dropout_grad");  // inputs are Out@GRAD and Mask only
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("Mask", Output("Mask"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());  // attributes are passed through unchanged
+    return op;
  }
 };

@@ -129,7 +140,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel {

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::DropoutGradOpDescMaker);
 REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,

--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+// Op maker for elementwise_floordiv. It only overrides the two hooks of
+// ElementwiseOpMaker: the display name "FloorDiv" and the equation string.
+// NOTE(review): presumably the base class uses both to build the op's
+// documentation -- confirm in elementwise_op.h.
+class ElementwiseFloorDivOpMaker : public ElementwiseOpMaker {
+ protected:
+  std::string GetName() const override { return "FloorDiv"; }
+  std::string GetEquation() const override { return "Out = X // Y"; }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+// Registered through REGISTER_OP_WITHOUT_GRADIENT: no gradient op is
+// declared for elementwise_floordiv in this file.
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
+                             ops::ElementwiseFloorDivOpMaker);
+
+// CPU kernels are instantiated for int and int64_t only.
+REGISTER_OP_CPU_KERNEL(
+    elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
+                                   int64_t>);
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// CUDA kernels for elementwise_floordiv, instantiated for int and int64_t
+// only (the same element types as the CPU registration).
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+
+// Binary functor computing floor division for integral T: the quotient is
+// rounded toward negative infinity, which is what the op's "Out = X // Y"
+// equation denotes. Plain C++ `/` truncates toward zero, so it gives a
+// different (larger) result whenever the division is inexact and the
+// operands have opposite signs, e.g. -7 / 2 == -3 but -7 // 2 == -4.
+// Division by zero is not checked here, exactly as before.
+template <typename T>
+struct FloorDivFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    T quotient = a / b;
+    // Step one down only in the case where truncation and floor disagree.
+    if ((a % b != 0) && ((a < 0) != (b < 0))) {
+      --quotient;
+    }
+    return quotient;
+  }
+};
+
+// Runs FloorDivFunctor<T> over x and y through ElementwiseComputeEx, using
+// the op's "axis" attribute, and writes the result into z.
+template <typename DeviceContext, typename T>
+void elementwise_floor_div(const framework::ExecutionContext &ctx,
+                           const framework::Tensor *x,
+                           const framework::Tensor *y, framework::Tensor *z) {
+  using Functor = FloorDivFunctor<T>;
+  const int axis_attr = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<Functor, DeviceContext, T>(ctx, x, y, axis_attr,
+                                                  Functor(), z);
+}
+
+// Kernel for elementwise_floordiv: requests the "Out" buffer for type T on
+// the execution place, then delegates to elementwise_floor_div on "X", "Y".
+template <typename DeviceContext, typename T>
+class ElementwiseFloorDivKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *lhs = ctx.Input<framework::LoDTensor>("X");
+    const auto *rhs = ctx.Input<framework::LoDTensor>("Y");
+    auto *result = ctx.Output<framework::LoDTensor>("Out");
+
+    // The output buffer must exist before the computation writes into it.
+    result->mutable_data<T>(ctx.GetPlace());
+
+    // The operands are int32 or int64 tensors.
+    elementwise_floor_div<DeviceContext, T>(ctx, lhs, rhs, result);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+// Op maker for elementwise_mod. It only overrides the two hooks of
+// ElementwiseOpMaker: the display name "Mod" and the equation string.
+// NOTE(review): presumably the base class uses both to build the op's
+// documentation -- confirm in elementwise_op.h.
+class ElementwiseModOpMaker : public ElementwiseOpMaker {
+ protected:
+  std::string GetName() const override { return "Mod"; }
+  std::string GetEquation() const override { return "Out = X % Y"; }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Registered through REGISTER_OP_WITHOUT_GRADIENT: no gradient op is
+// declared for elementwise_mod in this file.
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_mod, ops::ElementwiseOp,
+                             ops::ElementwiseModOpMaker);
+
+// CPU kernels are instantiated for int and int64_t only.
+REGISTER_OP_CPU_KERNEL(
+    elementwise_mod,
+    ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// CUDA kernels for elementwise_mod, instantiated for int and int64_t only
+// (the same element types as the CPU registration).
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_mod, ops::ElementwiseModKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseModKernel<plat::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+
+// Binary functor computing the modulo of integral T with the sign of the
+// divisor, so that floor(a / b) * b + mod(a, b) == a. C++ `%` alone yields a
+// remainder with the sign of the dividend (e.g. -7 % 2 == -1), which differs
+// from that convention whenever the remainder is non-zero and the operands
+// have opposite signs (expected: 1). Division by zero is not checked here,
+// exactly as before.
+template <typename T>
+struct ModFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    T remainder = a % b;
+    // Shift a non-zero remainder onto the divisor's side of zero.
+    if ((remainder != 0) && ((remainder < 0) != (b < 0))) {
+      remainder += b;
+    }
+    return remainder;
+  }
+};
+
+// Runs ModFunctor<T> over x and y through ElementwiseComputeEx, using the
+// op's "axis" attribute, and writes the result into z.
+template <typename DeviceContext, typename T>
+void elementwise_mod(const framework::ExecutionContext &ctx,
+                     const framework::Tensor *x, const framework::Tensor *y,
+                     framework::Tensor *z) {
+  using Functor = ModFunctor<T>;
+  const int axis_attr = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<Functor, DeviceContext, T>(ctx, x, y, axis_attr,
+                                                  Functor(), z);
+}
+
+// Kernel for elementwise_mod: requests the "Out" buffer for type T on the
+// execution place, then delegates to elementwise_mod on "X" and "Y".
+template <typename DeviceContext, typename T>
+class ElementwiseModKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *lhs = ctx.Input<framework::LoDTensor>("X");
+    const auto *rhs = ctx.Input<framework::LoDTensor>("Y");
+    auto *result = ctx.Output<framework::LoDTensor>("Out");
+
+    // The output buffer must exist before the computation writes into it.
+    result->mutable_data<T>(ctx.GetPlace());
+
+    // The operands are int32 or int64 tensors.
+    elementwise_mod<DeviceContext, T>(ctx, lhs, rhs, result);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -33,8 +33,51 @@ struct DequantizeFunctor<platform::CPUDeviceContext, T> {
  }
 };

+template <typename T>  // CPU specialization: out = in * scale / max_range
+struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& dev_ctx,
+                  const framework::Tensor* in, const framework::Tensor** scales,
+                  const int scale_num, T max_range, framework::Tensor* out) {
+    if (scale_num == 1) {  // one scale per slice along dim 0 of in
+      const int channel = in->dims()[0];
+      const T* scale_factor = scales[0]->data<T>();
+      for (int i = 0; i < channel; i++) {
+        T s = scale_factor[i];  // scale of slice i
+        framework::Tensor one_channel_in = in->Slice(i, i + 1);
+        framework::Tensor one_channel_out = out->Slice(i, i + 1);
+        auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
+        auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+        auto& dev = *dev_ctx.eigen_device();
+        out_e.device(dev) = (s / max_range) * in_e;
+      }
+    } else if (scale_num == 2) {  // scales[0] per dim 1, scales[1][0] global
+      int batch_size = in->dims()[0];
+      int channel = in->dims()[1];
+      const T* scale_one = scales[0]->data<T>();
+      const T* scale_two = scales[1]->data<T>();
+      for (int i = 0; i < batch_size; i++) {  // drop dim 0 from each slice
+        framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize(
+            framework::slice_ddim(in->dims(), 1, in->dims().size()));
+        framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize(
+            framework::slice_ddim(out->dims(), 1, out->dims().size()));
+        for (int j = 0; j < channel; j++) {
+          T s = scale_one[j];  // scale of index j along dim 1
+          framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1);
+          framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1);
+          auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
+          auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+          auto& dev = *dev_ctx.eigen_device();
+          out_e.device(dev) = (s * scale_two[0] / max_range) * in_e;
+        }
+      }
+    }  // any other scale_num leaves out unwritten
+  }
+};
+
 template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
 template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
+template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, float>;
+template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, double>;

 class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
 public:

--- a/paddle/fluid/operators/fake_dequantize_op.cu
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -44,8 +44,66 @@ struct DequantizeFunctor<platform::CUDADeviceContext, T> {
  }
 };

+template <typename T>  // out = in * scale[blockIdx.x] / max_range
+__global__ void DequantizeOneScale(const T* in, const T* scale, T max_range,
+                                   int num, int channel, T* out) {
+  int tid = threadIdx.x;
+  int channel_size = num / channel;  // elements handled by one block
+  const T* in_c = in + blockIdx.x * channel_size;  // this block's chunk
+  T* out_c = out + blockIdx.x * channel_size;
+  for (int i = tid; i < channel_size; i += blockDim.x) {  // step: blockDim.x
+    out_c[i] = in_c[i] * scale[blockIdx.x] / max_range;
+  }
+}
+
+template <typename T>  // out = in * scale_one[c] * scale_two[0] / max_range
+__global__ void DequantizeTwoScale(const T* in, const T* scale_one,
+                                   const T* scale_two, T max_range, int num,
+                                   int batch_size, int channel, T* out) {
+  int tid = threadIdx.x;
+  int channel_size = num / (batch_size * channel);  // elements per block
+  int scale_index = blockIdx.x % channel;  // index c into scale_one
+  const T* in_c = in + blockIdx.x * channel_size;  // this block's chunk
+  T* out_c = out + blockIdx.x * channel_size;
+  for (int i = tid; i < channel_size; i += blockDim.x) {  // step: blockDim.x
+    out_c[i] = in_c[i] * scale_one[scale_index] * scale_two[0] / max_range;
+  }
+}
+
+template <typename T>  // CUDA specialization: launches the kernels above
+struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& dev_ctx,
+                  const framework::Tensor* in, const framework::Tensor** scales,
+                  const int scale_num, T max_range, framework::Tensor* out) {
+    const T* in_data = in->data<T>();
+    T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
+    if (scale_num == 1) {  // one scale per index along dim 0
+      int num = in->numel();
+      int channel = in->dims()[0];
+      const T* scale_factor = scales[0]->data<T>();
+      int block = 1024;
+      int grid = channel;  // one block per index along dim 0
+      DequantizeOneScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
+          in_data, scale_factor, max_range, num, channel, out_data);
+    } else if (scale_num == 2) {  // scales[0] per dim 1, scales[1][0] global
+      int num = in->numel();
+      int batch_size = in->dims()[0];
+      int channel = in->dims()[1];
+      const T* scale_one = scales[0]->data<T>();
+      const T* scale_two = scales[1]->data<T>();
+      int block = 1024;
+      int grid = batch_size * channel;  // one block per (dim 0, dim 1) pair
+      DequantizeTwoScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
+          in_data, scale_one, scale_two, max_range, num, batch_size, channel,
+          out_data);
+    }  // any other scale_num launches nothing
+  }
+};
+
 template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
 template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
+template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, float>;
+template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, double>;

 }  // namespace operators
 }  // namespace paddle

--- a/paddle/fluid/operators/fake_dequantize_op.h
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <vector>
+#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"

@@ -28,6 +29,13 @@ struct DequantizeFunctor {
                  framework::Tensor* out);
 };

+template <typename DeviceContext, typename T>  // specialized per device
+struct ChannelDequantizeFunctor {
+  void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
+                  const framework::Tensor** scales, const int scale_num,
+                  T max_range, framework::Tensor* out);  // declaration only
+};
+
 template <typename DeviceContext, typename T>
 class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
 public:
@@ -54,32 +62,33 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
    auto scales = ctx.MultiInput<framework::Tensor>("Scales");
    auto* out = ctx.Output<framework::Tensor>("Out");

-    PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0],
-                      "The number of first scale values must be the same with "
-                      "first dimension value of Input(X).");
-
    auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
-    int max_range = std::pow(2, quant_bits[0] - 1) - 1;
+    int max_range = 1;

    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    out->mutable_data<T>(dev_ctx.GetPlace());
-
-    auto dequant = DequantizeFunctor<DeviceContext, T>();
-    for (int64_t i = 0; i < in->dims()[0]; i++) {
-      framework::Tensor one_channel_in = in->Slice(i, i + 1);
-      framework::Tensor one_channel_out = out->Slice(i, i + 1);
-      framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1);
-      dequant(dev_ctx, &one_channel_in, &one_channel_scale,
-              static_cast<T>(max_range), &one_channel_out);
-    }
-
-    if (scales.size() == 2) {
+    int scale_num = scales.size();
+    if (scale_num == 1) {
+      PADDLE_ENFORCE_EQ(
+          scales[0]->numel(), in->dims()[0],
+          "The number of first scale values must be the same with "
+          "first dimension value of Input(X) when the `Scales` has only one "
+          "element.");
+      max_range *= (std::pow(2, quant_bits[0] - 1) - 1);
+    } else if (scale_num == 2) {
+      PADDLE_ENFORCE_EQ(
+          scales[0]->numel(), in->dims()[1],
+          "The number of first scale values must be the same with "
+          "second dimension value of Input(X) when the `Scales` has two "
+          "elements.");
      PADDLE_ENFORCE_EQ(
          scales[1]->numel(), 1,
          "The second scale tensor should only have one value at now.");
-      max_range = std::pow(2, quant_bits[1] - 1) - 1;
-      dequant(dev_ctx, out, scales[1], static_cast<T>(max_range), out);
+      max_range *= (std::pow(2, quant_bits[0] - 1) - 1) *
+                   (std::pow(2, quant_bits[1] - 1) - 1);
    }
+    ChannelDequantizeFunctor<DeviceContext, T>()(
+        dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range), out);
  }
 };


--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -37,6 +37,21 @@ struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {

 template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;

+template <typename T>  // CPU: out[i] = abs of the Compare-max of chunk i
+struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
+                  const int num, const int channel, T* out) {
+    const int channel_size = num / channel;  // elements per contiguous chunk
+    for (int i = 0; i < channel; i++) {
+      auto* start = in + i * channel_size;  // chunk i is [start, end)
+      auto* end = in + (i + 1) * channel_size;
+      out[i] = std::abs(*(std::max_element(start, end, Compare<T>())));
+    }  // NOTE(review): Compare presumably orders by |x| -- confirm
+  }
+};
+
+template struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, float>;
+
 template <typename T>
 struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& ctx,
@@ -53,6 +68,36 @@ struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {

 template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;

+template <typename T>  // CPU: per-channel clip, then scale and round
+struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int channel,
+                  framework::Tensor* out) {
+    auto* scale_data = scale.data<T>();
+    auto* in_data = in.data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+    const int channel_size = in.numel() / channel;  // elements per chunk
+    platform::Transform<platform::CPUDeviceContext> trans;
+    for (int i = 0; i < channel; i++) {  // pass 1: ClipFunctor(-s, s)
+      T s = scale_data[i];
+      auto* start = in_data + i * channel_size;
+      auto* end = in_data + (i + 1) * channel_size;
+      trans(ctx, start, end, out_data + i * channel_size,
+            ClipFunctor<T>(-s, s));
+    }
+    for (int i = 0; i < channel; i++) {  // pass 2: round(bin_cnt / s * out)
+      T s = scale_data[i];
+      framework::Tensor one_channel_out = out->Slice(i, i + 1);
+      auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+      out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round();
+    }  // NOTE(review): Slice assumes channel == out dim 0 -- confirm
+  }
+};
+
+template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
+                                               float>;
+
 template <typename T>
 struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& ctx,
@@ -169,10 +214,10 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
        ctx->HasOutput("Out"),
        "Output(Out) of FakeChannelWiseQuantizeOp should not be null.");
    PADDLE_ENFORCE(
-        ctx->HasOutput("OutScales"),
-        "Output(Scales) of FakeChannelWiseQuantizeOp should not be null.");
+        ctx->HasOutput("OutScale"),
+        "Output(Scale) of FakeChannelWiseQuantizeOp should not be null.");
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]});
+    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[0]});
    ctx->ShareLoD("X", /*->*/ "Out");
  }

@@ -192,7 +237,7 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker
    AddOutput("Out",
              "(Tensor) Output of quantized low level tensor, "
              "but also saved as float data type.");
-    AddOutput("OutScales", "(Tensor) Current channel wise scale");
+    AddOutput("OutScale", "(Tensor) Current channel wise scale");
    AddAttr<int>("bit_length", "(int, default 8)")
        .SetDefault(8)
        .AddCustomChecker([](const int& bit_length) {

--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -74,6 +74,45 @@ struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {

 template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;

+// Per-channel abs-max reduction: thread block `blockIdx.x` scans the
+// `blockIdx.x`-th contiguous chunk of `n / c` elements of `in` and stores the
+// largest absolute value found in `out[blockIdx.x]`.
+//
+// Launch with one block per channel and `blockDim.x * sizeof(T)` bytes of
+// dynamic shared memory. The tree reduction below halves the active range on
+// every step, so `blockDim.x` has to be a power of two for every slot to be
+// folded into the result.
+template <typename T>
+__global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c,
+                                        T* out) {
+  int tid = threadIdx.x;
+  int channel_size = n / c;
+  const T* in_c = in + blockIdx.x * channel_size;
+  // Declare the dynamic shared buffer as raw bytes and cast it: a templated
+  // `extern __shared__ T buf[]` redeclares one symbol with a different type
+  // per instantiation and stops compiling once a second T is instantiated.
+  extern __shared__ __align__(sizeof(T)) unsigned char shared_max_raw[];
+  T* shared_max_data = reinterpret_cast<T*>(shared_max_raw);
+  // Step 1: every thread keeps the running abs-max of the elements it visits
+  // (tid, tid + blockDim.x, ...) in its own shared-memory slot.
+  shared_max_data[tid] = T(0);
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    T tmp = fabs(in_c[i]);
+    if (tmp > shared_max_data[tid]) {
+      shared_max_data[tid] = tmp;
+    }
+  }
+  __syncthreads();
+  // Step 2: pairwise tree reduction of the per-thread maxima into slot 0.
+  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
+    if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
+      shared_max_data[tid] = shared_max_data[tid + i];
+    }
+    __syncthreads();
+  }
+  // Only one thread publishes the channel's result.
+  if (tid == 0) {
+    out[blockIdx.x] = shared_max_data[0];
+  }
+}
+
+// CUDA specialization: launches FindChannelAbsMaxKernel on the context's
+// stream with one thread block per channel, so `out` receives `channel`
+// values computed from the `num` elements of `in`.
+template <typename T>
+struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx, const T* in,
+                  const int num, const int channel, T* out) {
+    const int threads_per_block = 1024;
+    const int num_blocks = channel;
+    // The kernel uses one T-sized shared-memory slot per thread.
+    const size_t shared_bytes = threads_per_block * sizeof(T);
+    FindChannelAbsMaxKernel<
+        T><<<num_blocks, threads_per_block, shared_bytes, ctx.stream()>>>(
+        in, num, channel, out);
+  }
+};
+
+template struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, float>;
+
 template <typename T>
 __global__ void ClipAndQuantKernel(const T* in, const T* scale,
                                   const int bin_cnt, const int n, T* out) {
@@ -82,14 +121,76 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale,

  T s = scale[0];
  for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
-    T x = in[bid];
+    T x = in[i];
+    T v = x > s ? s : x;
+    v = v < -s ? -s : v;
+    v = bin_cnt / s * v;
+    out[i] = round(v);
+  }
+}
+
+// CUDA specialization: clips and quantizes the whole tensor `in` with the
+// scale stored in `scale`, writing the result into `out` (allocated on the
+// context's place). The work is done by ClipAndQuantKernel on ctx.stream().
+template <typename T>
+struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, framework::Tensor* out) {
+    const int total = in.numel();
+    const int threads_per_block = 1024;
+    // Ceiling division: enough blocks to give every element a thread.
+    const int num_blocks = (total + threads_per_block - 1) / threads_per_block;
+
+    const T* src = in.data<T>();
+    const T* scale_ptr = scale.data<T>();
+    T* dst = out->mutable_data<T>(ctx.GetPlace());
+
+    ClipAndQuantKernel<T><<<num_blocks, threads_per_block, 0, ctx.stream()>>>(
+        src, scale_ptr, bin_cnt, total, dst);
+  }
+};
+
+template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
+
+template <typename T>  // per-channel clip + quantize; block b handles chunk b
+__global__ void ChannelClipAndQuantKernel(const T* in, const T* scale,
+                                          const int bin_cnt, const int n,
+                                          const int c, T* out) {
+  int tid = threadIdx.x;
+
+  int channel_size = n / c;  // elements per channel (n total, c channels)
+  const T* in_c = in + blockIdx.x * channel_size;  // this block's input chunk
+  T* out_c = out + blockIdx.x * channel_size;  // matching output chunk
+
+  T s = scale[blockIdx.x];  // clipping bound of this block's channel
+  for (int i = tid; i < channel_size; i += blockDim.x) {  // step by blockDim.x
+    T x = in_c[i];
    T v = x > s ? s : x;
    v = v < -s ? -s : v;
    v = bin_cnt / s * v;
-    out[bid] = round(v);
+    out_c[i] = round(v);  // x clamped to [-s, s], times bin_cnt / s, rounded
  }
 }

+// CUDA specialization: clips and quantizes `in` channel by channel, using
+// scale[i] for channel i. Launches ChannelClipAndQuantKernel on ctx.stream()
+// with one thread block per channel; `out` is allocated on the context's
+// place.
+template <typename T>
+struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int channel,
+                  framework::Tensor* out) {
+    const int total = in.numel();
+    const int threads_per_block = 1024;
+    const int num_blocks = channel;
+
+    const T* src = in.data<T>();
+    const T* scale_ptr = scale.data<T>();
+    T* dst = out->mutable_data<T>(ctx.GetPlace());
+
+    ChannelClipAndQuantKernel<
+        T><<<num_blocks, threads_per_block, 0, ctx.stream()>>>(
+        src, scale_ptr, bin_cnt, total, channel, dst);
+  }
+};
+
+template struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext,
+                                               float>;
+
 template <typename T>
 __global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
                                            const T* last_scale,
@@ -182,26 +283,6 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
 template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
                                               float>;

-template <typename T>
-struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, framework::Tensor* out) {
-    int num = in.numel();
-    int block = 1024;
-    int grid = (block - 1 + num) / block;
-
-    const T* in_data = in.data<T>();
-    const T* scale_data = scale.data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-
-    ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
-        in_data, scale_data, bin_cnt, num, out_data);
-  }
-};
-
-template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
-
 }  // namespace operators
 }  // namespace paddle


--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -42,6 +42,19 @@ struct FindRangeAbsMaxFunctor {
                  framework::Tensor* scales_arr, framework::Tensor* out_scale);
 };

+template <typename DeviceContext, typename T>
+struct FindChannelAbsMaxFunctor {  // declaration only; implemented per device
+  void operator()(const DeviceContext& ctx, const T* in, const int num,
+                  const int channel, T* out);  // NOTE(review): the CUDA version writes one abs-max per channel (num / channel contiguous elements each) to out
+};
+
+template <typename DeviceContext, typename T>
+struct ChannelClipAndFakeQuantFunctor {  // declaration only; implemented per device
+  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
+                  const framework::Tensor& scale, const int bin_cnt,
+                  const int channel, framework::Tensor* out);  // channel i of in is clipped to [-scale[i], scale[i]], scaled by bin_cnt / scale[i] and rounded into out
+};
+
 template <typename DeviceContext, typename T>
 struct FindMovingAverageAbsMaxFunctor {
  void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
@@ -78,29 +91,18 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
    auto* in = context.Input<framework::Tensor>("X");

    auto* out = context.Output<framework::Tensor>("Out");
-    auto* out_scales = context.Output<framework::Tensor>("OutScales");
-    T* out_scales_data = out_scales->mutable_data<T>(context.GetPlace());
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
    out->mutable_data<T>(context.GetPlace());

    int bit_length = context.Attr<int>("bit_length");
    int bin_cnt = std::pow(2, bit_length - 1) - 1;

    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto find_abs_max = FindAbsMaxFunctor<DeviceContext, T>();
-    for (int64_t i = 0; i < in->dims()[0]; i++) {
-      framework::Tensor one_channel = in->Slice(i, i + 1);
-      const T* one_channel_data = one_channel.data<T>();
-      find_abs_max(dev_ctx, one_channel_data, one_channel.numel(),
-                   &out_scales_data[i]);
-    }
-    auto clip_quant = ClipAndFakeQuantFunctor<DeviceContext, T>();
-    for (int64_t i = 0; i < in->dims()[0]; i++) {
-      framework::Tensor one_channel_in = in->Slice(i, i + 1);
-      framework::Tensor one_channel_out = out->Slice(i, i + 1);
-      framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1);
-      clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt,
-                 &one_channel_out);
-    }
+    FindChannelAbsMaxFunctor<DeviceContext, T>()(
+        dev_ctx, in->data<T>(), in->numel(), in->dims()[0], out_scale_data);
+    ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
+        dev_ctx, *in, *out_scale, bin_cnt, in->dims()[0], out);
  }
 };


--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/layer_norm_op.h"
+#include <memory>

 namespace paddle {
 namespace operators {
@@ -133,7 +134,7 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
    }
    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
      ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Bias"));
+                        ctx->GetInputDim("Scale"));
    }
  }

@@ -157,12 +158,39 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
  }
 };

+// Builds the `layer_norm_grad` op description for a forward `layer_norm` op.
+//
+// The grad op always receives X, Mean, Variance and Y@GRAD and always
+// produces X@GRAD. Scale (with Scale@GRAD) and Bias@GRAD are wired in only
+// when the forward op lists the corresponding input.
+class LayerNormGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    const bool has_scale = ForwardOp().Inputs().count("Scale") > 0;
+    const bool has_bias = ForwardOp().Inputs().count("Bias") > 0;
+
+    std::unique_ptr<framework::OpDesc> grad_op(new framework::OpDesc());
+    grad_op->SetType("layer_norm_grad");
+
+    // Mandatory inputs: the forward input, its saved statistics and Y@GRAD.
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetInput("Mean", Output("Mean"));
+    grad_op->SetInput("Variance", Output("Variance"));
+    grad_op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+
+    // Optional parameters. Bias itself is not fed to the grad op, only its
+    // gradient slot is declared.
+    if (has_scale) {
+      grad_op->SetInput("Scale", Input("Scale"));
+      grad_op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+    }
+    if (has_bias) {
+      grad_op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    }
+
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return grad_op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::LayerNormGradOpDescMaker);
 REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp);
 REGISTER_OP_CPU_KERNEL(
    layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,

--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
@@ -245,11 +245,9 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    const float epsilon = ctx.Attr<float>("epsilon");
    auto x = *ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
    auto* mean = ctx.Input<Tensor>("Mean");
    auto* var = ctx.Input<Tensor>("Variance");
    auto* scale = ctx.Input<Tensor>("Scale");
-    auto* bias = ctx.Input<Tensor>("Bias");
    auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");

@@ -275,10 +273,6 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
      x.Resize(matrix_shape);
      temp.mutable_data<T>(matrix_shape, ctx.GetPlace());

-      if (!(bias && scale)) {
-        temp_norm.ShareDataWith(*y);
-        temp_norm.Resize(matrix_shape);
-      } else {
      temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
      // get x_norm
      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
@@ -287,7 +281,6 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
          ctx, &temp_norm, var, /*axis*/ 0,
          DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
    }
-    }

    if (d_bias) {
      d_bias->mutable_data<T>(ctx.GetPlace());

--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -11,89 +11,27 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/load_combine_op.h"

 namespace paddle {
 namespace operators {

-class LoadCombineOp : public framework::OperatorBase {
+class LoadCombineOp : public framework::OperatorWithKernel {
 public:
-  LoadCombineOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto filename = Attr<std::string>("file_path");
-    auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    auto model_from_memory = Attr<bool>("model_from_memory");
-    auto out_var_names = Outputs("Out");
-    PADDLE_ENFORCE_GT(
-        static_cast<int>(out_var_names.size()), 0,
-        "The number of output variables should be greater than 0.");
-    if (!model_from_memory) {
-      std::ifstream fin(filename, std::ios::binary);
-      PADDLE_ENFORCE(static_cast<bool>(fin),
-                     "Cannot open file %s for load_combine op", filename);
-      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
-    } else {
-      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
-      std::stringstream fin(filename, std::ios::in | std::ios::binary);
-      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
-    }
-  }
-  void LoadParamsFromBuffer(
-      const framework::Scope &scope, const platform::Place &place,
-      std::istream *buffer, bool load_as_fp16,
-      const std::vector<std::string> &out_var_names) const {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    for (size_t i = 0; i < out_var_names.size(); i++) {
-      auto *out_var = scope.FindVar(out_var_names[i]);
-
-      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
-                     out_var_names[i]);
-
-      auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-
-      // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
-
-      // Get data from fin to tensor
-      DeserializeFromStream(*buffer, tensor, dev_ctx);
-
-      auto in_dtype = tensor->type();
-      auto out_dtype =
-          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-      if (in_dtype != out_dtype) {
-        // convert to float16 tensor
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor fp16_tensor;
-        // copy LoD info to the new tensor
-        fp16_tensor.set_lod(tensor->lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                                 &fp16_tensor);
-
-        // reset output tensor
-        out_var->Clear();
-        tensor = out_var->GetMutable<framework::LoDTensor>();
-        tensor->set_lod(fp16_tensor.lod());
-        tensor->ShareDataWith(fp16_tensor);
-      }
-    }
-    buffer->peek();
-    PADDLE_ENFORCE(buffer->eof(),
-                   "You are not allowed to load partial data via "
-                   "load_combine_op, use load_op instead.");
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+
+ protected:
+  // Kernel selection: always the FP32 kernel, on the place the op is being
+  // executed on.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(framework::proto::VarType::FP32,
+                                   ctx.GetPlace());
  }
 };

@@ -136,9 +74,18 @@ that were saved using the SaveCombine operator.
 )DOC");
  }
 };
+
 }  // namespace operators
 }  // namespace paddle
+
 namespace ops = paddle::operators;

 REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
                  ops::LoadCombineOpProtoMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/load_combine_op.cu
+++ b/paddle/fluid/operators/load_combine_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_combine_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/load_combine_op.h
+++ b/paddle/fluid/operators/load_combine_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+// Kernel for the `load_combine` op: reads every variable listed in "Out", in
+// order, from a single serialized stream. The stream is either the file named
+// by the `file_path` attribute or, when `model_from_memory` is set, the
+// attribute's string content itself.
+template <typename DeviceContext, typename T>
+class LoadCombineOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto place = ctx.GetPlace();
+    const auto filename = ctx.Attr<std::string>("file_path");
+    const bool load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
+    const bool model_from_memory = ctx.Attr<bool>("model_from_memory");
+    auto &out_var_names = ctx.Outputs("Out");
+
+    PADDLE_ENFORCE_GT(
+        static_cast<int>(out_var_names.size()), 0,
+        "The number of output variables should be greater than 0.");
+
+    if (model_from_memory) {
+      // `file_path` carries the serialized bytes rather than a path.
+      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
+      std::stringstream mem_stream(filename, std::ios::in | std::ios::binary);
+      LoadParamsFromBuffer(ctx, place, &mem_stream, load_as_fp16,
+                           out_var_names);
+      return;
+    }
+
+    std::ifstream file_stream(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(file_stream),
+                   "Cannot open file %s for load_combine op", filename);
+    LoadParamsFromBuffer(ctx, place, &file_stream, load_as_fp16,
+                         out_var_names);
+  }
+
+  // Deserializes one LoDTensor per entry of `out_var_names` from `buffer`,
+  // optionally converting each to FP16, and requires the stream to be fully
+  // consumed afterwards.
+  void LoadParamsFromBuffer(
+      const framework::ExecutionContext &context, const platform::Place &place,
+      std::istream *buffer, bool load_as_fp16,
+      const std::vector<std::string> &out_var_names) const {
+    auto &dev_ctx = *platform::DeviceContextPool::Instance().Get(place);
+    auto out_vars = context.MultiOutputVar("Out");
+
+    for (size_t idx = 0; idx < out_var_names.size(); ++idx) {
+      auto *var = out_vars[idx];
+      PADDLE_ENFORCE(var != nullptr, "Output variable %s cannot be found",
+                     out_var_names[idx]);
+
+      auto *tensor = var->GetMutable<framework::LoDTensor>();
+
+      // The stream must still be readable before the next tensor is parsed.
+      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
+
+      DeserializeFromStream(*buffer, tensor, dev_ctx);
+
+      const auto in_dtype = tensor->type();
+      const auto out_dtype =
+          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+      if (in_dtype == out_dtype) {
+        continue;  // nothing to convert for this variable
+      }
+
+      // Convert into a temporary FP16 tensor that carries the same LoD.
+      framework::LoDTensor converted;
+      converted.set_lod(tensor->lod());
+      framework::TransDataType(framework::OpKernelType(in_dtype, place),
+                               framework::OpKernelType(out_dtype, place),
+                               *tensor, &converted);
+
+      // Replace the variable's content with the converted tensor.
+      var->Clear();
+      auto *replaced = var->GetMutable<framework::LoDTensor>();
+      replaced->set_lod(converted.lod());
+      replaced->ShareDataWith(converted);
+    }
+
+    // peek() sets eofbit when no bytes remain; leftovers mean the caller
+    // asked for fewer variables than the stream contains.
+    buffer->peek();
+    PADDLE_ENFORCE(buffer->eof(),
+                   "You are not allowed to load partial data via "
+                   "load_combine_op, use load_op instead.");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -11,89 +11,26 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>

-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/profiler.h"
+#include <string>
+
+#include "paddle/fluid/operators/load_op.h"

 namespace paddle {
 namespace operators {

-class LoadOp : public framework::OperatorBase {
+class LoadOp : public framework::OperatorWithKernel {
 public:
-  LoadOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    auto filename = Attr<std::string>("file_path");
-    std::ifstream fin(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
-                   filename);
+  using framework::OperatorWithKernel::OperatorWithKernel;

-    auto out_var_name = Output("Out");
-    auto *out_var = scope.FindVar(out_var_name);
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Output variable %s cannot be found in scope %p",
-                   out_var_name, &scope);
+  void InferShape(framework::InferShapeContext *ctx) const override {}

-    if (out_var->IsType<framework::LoDTensor>()) {
-      LoadLodTensor(fin, place, out_var);
-    } else if (out_var->IsType<framework::SelectedRows>()) {
-      LoadSelectedRows(fin, place, out_var);
-    } else {
-      PADDLE_ENFORCE(
-          false,
-          "Load only support LoDTensor and SelectedRows, %s has wrong type",
-          out_var_name);
-    }
-  }
-
-  void LoadLodTensor(std::istream &fin, const platform::Place &place,
-                     framework::Variable *var) const {
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    auto *tensor = var->GetMutable<framework::LoDTensor>();
-    DeserializeFromStream(fin, tensor, dev_ctx);
-
-    auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    auto in_dtype = tensor->type();
-    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-    if (in_dtype != out_dtype) {
-      // convert to float16 tensor
-      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-      framework::LoDTensor fp16_tensor;
-      // copy LoD info to the new tensor
-      fp16_tensor.set_lod(tensor->lod());
-      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                               &fp16_tensor);
-
-      // reset output tensor
-      var->Clear();
-      tensor = var->GetMutable<framework::LoDTensor>();
-      tensor->set_lod(fp16_tensor.lod());
-      tensor->ShareDataWith(fp16_tensor);
-    }
-  }
-
-  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
-                        framework::Variable *var) const {
-    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
-    selectedRows->SyncIndex();
+ protected:
+  // Kernel selection for `load`: always the FP32 kernel, on the place the op
+  // is being executed on.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    // Use the execution place rather than a hard-coded CPUPlace: otherwise
+    // the CUDA kernels registered for `load` can never be selected and data
+    // is always loaded to CPU, unlike LoadCombineOp and unlike the previous
+    // RunImpl, which loaded to the place it was given.
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::proto::VarType::FP32, ctx.GetPlace());
+    return kt;
  }
 };

@@ -116,8 +53,15 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
        "file.");
  }
 };
+
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;

 REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/load_op.cu
+++ b/paddle/fluid/operators/load_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/load_op.h
+++ b/paddle/fluid/operators/load_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+// Kernel for the `load` op: deserializes a single variable from the file
+// named by the `file_path` attribute into the "Out" variable. "Out" must
+// already hold a LoDTensor or a SelectedRows; any other type is rejected.
+template <typename DeviceContext, typename T>
+class LoadOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    auto filename = ctx.Attr<std::string>("file_path");
+    std::ifstream fin(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
+                   filename);
+
+    // Keep the name as a string, not as a pointer into the name vector, so
+    // that the "%s" placeholders below print the variable name.
+    auto &out_var_names = ctx.Outputs("Out");
+    const std::string out_var_name =
+        out_var_names.empty() ? std::string() : out_var_names.front();
+    auto *out_var = ctx.OutputVar("Out");
+
+    PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found ",
+                   out_var_name);
+
+    if (out_var->IsType<framework::LoDTensor>()) {
+      LoadLodTensor(fin, place, out_var, ctx);
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      LoadSelectedRows(fin, place, out_var);
+    } else {
+      PADDLE_ENFORCE(
+          false,
+          "Load only support LoDTensor and SelectedRows, %s has wrong type",
+          out_var_name);
+    }
+  }
+
+  // Reads a LoDTensor from `fin` into `var`. When the `load_as_fp16`
+  // attribute is set and the stored type differs, the tensor is converted to
+  // FP16 and `var` is reset to hold the converted data.
+  void LoadLodTensor(std::istream &fin, const platform::Place &place,
+                     framework::Variable *var,
+                     const framework::ExecutionContext &ctx) const {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    DeserializeFromStream(fin, tensor, dev_ctx);
+
+    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
+    auto in_dtype = tensor->type();
+    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+    if (in_dtype != out_dtype) {
+      // convert to float16 tensor
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor fp16_tensor;
+      // copy LoD info to the new tensor
+      fp16_tensor.set_lod(tensor->lod());
+      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                               &fp16_tensor);
+
+      // reset output tensor
+      var->Clear();
+      tensor = var->GetMutable<framework::LoDTensor>();
+      tensor->set_lod(fp16_tensor.lod());
+      tensor->ShareDataWith(fp16_tensor);
+    }
+  }
+
+  // Reads a SelectedRows from `fin` into `var` and then calls SyncIndex() on
+  // it.
+  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
+                        framework::Variable *var) const {
+    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
+    selectedRows->SyncIndex();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -32,7 +32,10 @@ class LoDResetOp : public framework::OperatorWithKernel {
      PADDLE_ENFORCE_GT(level0.size(), 1,
                        "If Input(Y) not provided, the target lod should be "
                        "specified by attribute `target_lod`.");
+    } else {
+      ctx->ShareLoD("Y", "Out");
    }
+
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
  }


--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
@@ -78,12 +78,6 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                   "The numel of 'pad_value' can only be 1 or be equal to the "
                   "'step_width'.");

-    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
-      TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor);
-      pad_tensor->Resize(pad_tensor_dims);
-      return;
-    }
-
    const int kBlockSize = 512;

    /* At least use 32 threads to copy sequence_width elements,
@@ -129,12 +123,13 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {

    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
              step_width, layout);
-
+    /*
    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
      TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor);
      seq_tensor->Resize(seq_tensor_dims);
      return;
    }
+    */

    const int kBlockSize = 512;


--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -290,8 +290,10 @@ class MatMulOp : public framework::OperatorWithKernel {
                                     context->Attrs().Get<bool>("transpose_Y"));

    PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_);
+    if (context->IsRuntime()) {
      PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
                     mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
+    }
    std::vector<int64_t> dim_out;
    if (mat_dim_x.batch_size_ != 0) {
      dim_out = framework::vectorize(dim_x);

--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <memory>
 #include "paddle/fluid/operators/concat_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"

 namespace paddle {
 namespace operators {
@@ -38,15 +39,20 @@ static void EnforceLayouts(const std::vector<const Tensor*> inputs) {
 }

 static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
-                                                const mkldnn::engine& engine) {
-  constexpr auto data_type = mkldnn::memory::f32;
+                                                const mkldnn::engine& engine,
+                                                const memory::data_type& dt) {
  const auto dims = paddle::framework::vectorize2int(input.dims());
  const auto format = input.format();
-  auto description = memory::desc(dims, data_type, format);
+  auto description = memory::desc(dims, dt, format);
  auto mem_prim_desc = memory::primitive_desc(description, engine);
  return mem_prim_desc;
 }

+static mkldnn::memory::format GetDstMemFormat(
+    const concat::primitive_desc& concat_pd) {  // format of the concat dst desc
+  return (memory::format)concat_pd.dst_primitive_desc().desc().data.format;
+}
+
 static platform::CPUPlace GetCpuPlace(
    const paddle::framework::ExecutionContext& ctx) {
  auto place = ctx.GetPlace();
@@ -61,14 +67,30 @@ static const mkldnn::engine& GetMKLDNNEngine(
  return dev_ctx.GetEngine();
 }

+std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
+                      const std::vector<const Tensor*> multi_input,
+                      const int64_t& concat_axis, const memory::data_type& dt) {
+  std::string key;
+  key.reserve(platform::MKLDNNHandler::MaxKeyLength);
+  for (size_t i = 0; i < multi_input.size(); i++) {  // dims of every input
+    platform::MKLDNNHandler::AppendKeyDims(
+        &key, paddle::framework::vectorize2int(multi_input[i]->dims()));
+  }
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(concat_axis));
+  platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Out"));
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
+  return key;  // built from: input dims, axis, Output("Out"), data type
+}
+
 template <typename T>
 class ConcatPrimitiveFactory {
 public:
  concat::primitive_desc CreateConcatPrimDescriptor(
      const std::vector<const Tensor*> multi_input, Tensor* output,
-      int concat_axis, const mkldnn::engine& mkldnn_engine) {
-    CreateSourcesDescriptors(multi_input, mkldnn_engine);
-    auto dst_desc = CreateDstMemDescriptor(output);
+      int concat_axis, const mkldnn::engine& mkldnn_engine,
+      const memory::data_type& dt = memory::data_type::f32) {
+    CreateSourcesDescriptors(multi_input, mkldnn_engine, dt);
+    auto dst_desc = CreateDstMemDescriptor(output, dt);
    return concat::primitive_desc(dst_desc, concat_axis, srcs_pd);
  }

@@ -79,23 +101,39 @@ class ConcatPrimitiveFactory {
    return concat(concat_pd, inputs, dst_mem.get());
  }

+  void SetSrcDataHandleByIndex(const std::vector<memory>& srcs, const size_t& i,
+                               void* handler) {  // `srcs` is the argument, not the member
+    srcs[i].set_data_handle(handler);  // point the i-th source memory at handler
+  }
+
+  void SetDstDataHandle(const memory& dst_mem, void* handler) {
+    dst_mem.set_data_handle(handler);  // parameter dst_mem shadows the member
+  }
+
+  std::vector<memory> GetSrcs() { return srcs; }  // returns the vector by value
+
+  memory GetDst() { return dst_mem.get(); }  // returns dst_mem.get() by value
+
 private:
-  memory::desc CreateDstMemDescriptor(Tensor* output) {
+  memory::desc CreateDstMemDescriptor(Tensor* output,
+                                      const memory::data_type& dt) {
    auto dst_dims = paddle::framework::vectorize2int(output->dims());
-    return memory::desc(dst_dims, platform::MKLDNNGetDataType<T>(),
-                        memory::format::any);
+    return memory::desc(dst_dims, dt, memory::format::any);
  }

  mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd,
-                                 Tensor* output, platform::CPUPlace place) {
+                                 Tensor* output,
+                                 const platform::CPUPlace& place) {
    return memory(concat_pd.dst_primitive_desc(),
                  output->mutable_data<T>(place));
  }

  void CreateSourcesDescriptors(const std::vector<const Tensor*> multi_input,
-                                const mkldnn::engine& mkldnn_engine) {
+                                const mkldnn::engine& mkldnn_engine,
+                                const memory::data_type& dt) {
    for (size_t i = 0; i < multi_input.size(); i++) {
-      auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine);
+      auto mem_prim_desc =
+          CreateMemPrimDesc(*multi_input[i], mkldnn_engine, dt);
      srcs_pd.push_back(mem_prim_desc);
      srcs.push_back(
          memory(mem_prim_desc, to_void_cast(multi_input[i]->data<T>())));
@@ -120,21 +158,59 @@ template <typename T>
 class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    auto place = GetCpuPlace(ctx);
-    const auto& mkldnn_engine = GetMKLDNNEngine(ctx);
-
    auto multi_input = ctx.MultiInput<Tensor>("X");
    EnforceLayouts(multi_input);
    Tensor* output = ctx.Output<Tensor>("Out");
    int64_t concat_axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    auto place = GetCpuPlace(ctx);
+
+    memory::data_type dt =
+        paddle::framework::ToMKLDNNDataType(multi_input[0]->type());

    ConcatPrimitiveFactory<T> prim_creator;
-    auto concat_pd = prim_creator.CreateConcatPrimDescriptor(
-        multi_input, output, static_cast<int>(concat_axis), mkldnn_engine);
-    auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place);
-    stream(stream::kind::eager).submit({concat}).wait();
+    std::string key = CreateKey(ctx, multi_input, concat_axis, dt);
+    const std::string key_prim = key + "@concat_p";  // one blob name per cached object
+    const std::string key_concat_pd = key + "@concat_pd";
+    const std::string key_srcs = key + "@concat_srcs";
+    const std::string key_dst = key + "@concat_dst";
+
+    std::shared_ptr<concat::primitive_desc> concat_pd;
+    std::shared_ptr<std::vector<memory>> srcs;
+    std::shared_ptr<memory> dst_mem;
+    auto concat_p = std::static_pointer_cast<concat>(dev_ctx.GetBlob(key_prim));
+
+    if (concat_p == nullptr) {  // cache miss: build all blobs and store them
+      const auto& mkldnn_engine = dev_ctx.GetEngine();
+      concat_pd = std::make_shared<concat::primitive_desc>(
+          prim_creator.CreateConcatPrimDescriptor(multi_input, output,
+                                                  static_cast<int>(concat_axis),
+                                                  mkldnn_engine, dt));
+      concat_p = std::make_shared<concat>(
+          prim_creator.CreateConcatPrimitive(*concat_pd, output, place));
+      srcs = std::make_shared<std::vector<memory>>(prim_creator.GetSrcs());
+      dst_mem = std::make_shared<memory>(prim_creator.GetDst());
+      dev_ctx.SetBlob(key_prim, concat_p);
+      dev_ctx.SetBlob(key_concat_pd, concat_pd);
+      dev_ctx.SetBlob(key_srcs, srcs);
+      dev_ctx.SetBlob(key_dst, dst_mem);
+    } else {  // cache hit: reuse the blobs, only rebind the data handles
+      srcs = std::static_pointer_cast<std::vector<memory>>(
+          dev_ctx.GetBlob(key_srcs));
+      dst_mem = std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_dst));
+      concat_pd = std::static_pointer_cast<concat::primitive_desc>(
+          dev_ctx.GetBlob(key_concat_pd));
+      for (size_t i = 0; i < multi_input.size(); i++) {
+        prim_creator.SetSrcDataHandleByIndex(
+            *srcs, i, to_void_cast<T>(multi_input[i]->data<T>()));
+      }
+      prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data<T>(place));
+    }
+
+    stream(stream::kind::eager).submit({*concat_p}).wait();  // run the concat

-    output->set_mkldnn_prim_desc(concat_pd.dst_primitive_desc());
+    output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc());
  }
 };
 }  // namespace operators
@@ -143,4 +219,6 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 namespace ops = paddle::operators;

 REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ConcatMKLDNNOpKernel<float>)
+                   ops::ConcatMKLDNNOpKernel<float>,
+                   ops::ConcatMKLDNNOpKernel<int8_t>,
+                   ops::ConcatMKLDNNOpKernel<uint8_t>);
--- a/paddle/fluid/operators/ngraph/ngraph_engine.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc
@@ -92,12 +92,10 @@ static std::vector<std::vector<int>> NgraphOpIntervals(

  int size = ops->size();
  int left = 0;
-  while (left < size && ops->at(left)->Type() != framework::kFeedOpType) {
+  while (left < size && ops->at(left)->Type() != framework::kFeedOpType &&
+         ops->at(left)->Type() != framework::kFetchOpType) {
    ++left;
  }
-  if (left == size) {
-    return intervals;
-  }

  while (left < size && ops->at(left)->Type() == framework::kFeedOpType) {
    for (auto& var_name_item : ops->at(left)->Outputs()) {
@@ -112,10 +110,6 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
  while (right < size && ops->at(right)->Type() != framework::kFetchOpType) {
    ++right;
  }
-  if (right == size) {
-    return intervals;
-  }
-  if (left >= right) return intervals;

  int index = right;
  while (index < size && ops->at(index)->Type() == framework::kFetchOpType) {
@@ -127,6 +121,10 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
    ++index;
  }

+  if (left == size || ops->at(left)->Type() == framework::kFetchOpType) {
+    left = 0;
+  }
+
  // (left, right - 1) represents indices between feed and fetch
  int pivot = left;
  while (pivot < right) {
@@ -234,6 +232,7 @@ NgraphEngine::NgraphEngine(const framework::Scope& scope,
 }

 void NgraphEngine::Prepare(const std::vector<int>& interval) {
+  bool has_fetch = false, is_full = false;
  for (auto& var : p_bdesc->AllVars()) {
    if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS ||
          var->GetType() == framework::proto::VarType::LOD_TENSOR ||
@@ -264,6 +263,9 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
  std::vector<paddle::framework::OpDesc*> ops_desc;
  for (auto op_desc : p_bdesc->AllOps()) {
    ops_desc.emplace_back(op_desc);
+    if (op_desc->Type() == framework::kFetchOpType) {
+      has_fetch = true;
+    }
  }

  for (auto op_desc : ops_desc) {
@@ -276,11 +278,11 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
  if (interval[0] > 0 &&
      ops_desc.at(interval[0] - 1)->Type() == framework::kFeedOpType &&
      interval[1] < static_cast<int>(ops_desc.size()) &&
-      ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) {
-    this->op_state_ = OpState::FULL;
+      ops_desc.at(interval[1])->Type() == framework::kFetchOpType) {
+    is_full = true;
  }

-  if (this->op_state_ == OpState::FULL) {
+  if (is_full) {
    this->op_state_ = this->is_test_ ? OpState::FULL_TEST : OpState::FULL_TRAIN;
  } else {
    this->op_state_ =
@@ -293,7 +295,8 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
        framework::OpRegistry::CreateOp(*(ops_desc[idx])));
    ++idx;
  }
-  while (ops_desc.at(idx)->Type() != framework::kFetchOpType) {
+  while (idx < static_cast<int>(ops_desc.size()) &&
+         ops_desc.at(idx)->Type() != framework::kFetchOpType) {
    auto op_desc = ops_desc.at(idx);
    for (auto& var_name_item : op_desc->Inputs()) {
      for (auto& var_name : var_name_item.second) {
@@ -303,6 +306,10 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
    ++idx;
  }

+  if (!has_fetch) {
+    op_state_ = OpState::UNKNOWN;
+  }
+
  BuildNgIO(ops_desc, interval);
 }

@@ -318,7 +325,8 @@ void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
        const bool is_output = outputs.find(var_name) != outputs.end();
        if (!is_output &&
            std::find(var_in_.begin(), var_in_.end(), var_name) ==
-                var_in_.end()) {
+                var_in_.end() &&
+            scope_.FindVar(var_name)) {
          // fill var_in here to keep lhs and rhs order
          this->var_in_.emplace_back(var_name);
        }
@@ -378,6 +386,7 @@ void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
      }
    }
  }
+
  for (size_t i = 0; i < var_in_.size(); ++i) {
    auto var_name = var_in_[i];
    if (persistables_.find(var_name) == persistables_.end()) {

--- a/paddle/fluid/operators/ngraph/ngraph_engine.h
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifndef PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
-#define PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
+#pragma once
+
 #include <memory>
 #include <set>
 #include <string>
@@ -35,7 +35,6 @@ enum class OpState {                /* nGraph support state on ops          */
                     PARTIAL_TRAIN, /* Support partial ops for train        */
                     FULL_TEST,     /* Support full list of ops for test    */
                     PARTIAL_TEST,  /* Support partial list of ops for test */
-                     FULL,          /* All ops supported from feed to fetch */
                     UNKNOWN        /* Output all for debug purpose         */
 };

@@ -119,4 +118,3 @@ class NgraphEngine {

 }  // namespace operators
 }  // namespace paddle
-#endif  // PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
--- a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
+++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
@@ -27,13 +27,9 @@ namespace paddle {
 namespace operators {
 namespace ngraphs {

-void BuildCrossEntropyNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+std::shared_ptr<ngraph::Node> GetCrossEntropy(
+    std::shared_ptr<ngraph::Node> x, std::shared_ptr<ngraph::Node> label,
+    const bool is_soft_label, int ignore_index) {
  auto label_shape = label->get_shape();
  auto x_shape = x->get_shape();
  auto label_rank = label_shape.size();
@@ -46,18 +42,16 @@ void BuildCrossEntropyNode(
    label_2d = paddle::platform::NgReshaper(label, label_2d_shape);
  }
  if (x_rank > 2) {
-    x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_rank - 1);
-    x_2d = paddle::platform::NgReshaper(x, x_2d_shape);
+    x_2d_shape = platform::FlattenTo2d(x_shape, x_rank - 1);
+    x_2d = platform::NgReshaper(x, x_2d_shape);
  }

  auto batch_size = x_2d_shape.at(0);
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  const bool is_soft_label = op_attrs.Get<bool>("soft_label");

  std::shared_ptr<ngraph::Node> node_1_hot = label_2d;
  if (!is_soft_label) {
-    auto label_1d = paddle::platform::NgReshaper(
-        label_2d, ngraph::Shape{label_2d_shape.at(0)});
+    auto label_1d =
+        platform::NgReshaper(label_2d, ngraph::Shape{label_2d_shape.at(0)});
    node_1_hot = std::make_shared<ngraph::op::OneHot>(label_1d, x_2d_shape, 1);
  }
  if (x->get_element_type() != node_1_hot->get_element_type()) {
@@ -76,11 +70,9 @@ void BuildCrossEntropyNode(
  auto node_sum =
      std::make_shared<ngraph::op::Sum>(node_mul, ngraph::AxisSet{1});
  auto node_neg = std::make_shared<ngraph::op::Negative>(node_sum);
-  auto xe =
-      paddle::platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1});
+  auto xe = platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1});

  if (!is_soft_label) {
-    auto ignore_index = op_attrs.Get<int>("ignore_index");
    auto ignore_node = ngraph::op::Constant::create(
        label->get_element_type(), label_2d_shape, {ignore_index});
    auto not_equal_node =
@@ -89,21 +81,13 @@ void BuildCrossEntropyNode(
                                                      xe->get_element_type());
    xe = xe * mask;
  }
-
-  paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map);
+  return xe;
 }

-void BuildCrossEntropyGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
-
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
-  auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
+std::shared_ptr<ngraph::Node> GetCrossEntropyGrad(
+    std::shared_ptr<ngraph::Node> x, std::shared_ptr<ngraph::Node> label,
+    std::shared_ptr<ngraph::Node> dy, const bool is_soft_label,
+    int ignore_index) {
  auto x_shape = x->get_shape();
  auto rank = x_shape.size();

@@ -111,9 +95,8 @@ void BuildCrossEntropyGradNode(
  if (!is_soft_label) {
    auto label_shape = label->get_shape();
    label_shape.pop_back();
-    label = paddle::platform::NgReshaper(label, label_shape);
+    label = platform::NgReshaper(label, label_shape);

-    auto ignore_index = op_attrs.Get<int>("ignore_index");
    auto ignore_node = ngraph::op::Constant::create(
        label->get_element_type(), label_shape, {ignore_index});
    auto not_equal_node =
@@ -128,7 +111,7 @@ void BuildCrossEntropyGradNode(

  auto dy_shape = dy->get_shape();
  dy_shape.pop_back();
-  auto dy_reshape = paddle::platform::NgReshaper(dy, dy_shape);
+  auto dy_reshape = platform::NgReshaper(dy, dy_shape);
  auto dy_bcast = std::make_shared<ngraph::op::Broadcast>(
      dy_reshape, x_shape, ngraph::AxisSet{rank - 1});
  if (x->get_element_type() != label->get_element_type()) {
@@ -140,7 +123,35 @@ void BuildCrossEntropyGradNode(
  if (!is_soft_label) {
    xe_grad = xe_grad * mask;
  }
+  return xe_grad;
+}

+void BuildCrossEntropyNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {  // wrapper: read inputs/attrs from op, store result as "Y"
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
+  int ignore_index = op_attrs.Get<int>("ignore_index");
+  auto xe = GetCrossEntropy(x, label, is_soft_label, ignore_index);  // builds the nodes
+  paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map);
+}
+
+void BuildCrossEntropyGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {  // wrapper: read inputs/attrs from op, store "X@GRAD"
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
+  int ignore_index = op_attrs.Get<int>("ignore_index");
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+  auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
+  auto xe_grad = GetCrossEntropyGrad(x, label, dy, is_soft_label, ignore_index);
  paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map);
 }
 }  // namespace ngraphs

--- a/paddle/fluid/operators/ngraph/ops/softmax_op.h
+++ b/paddle/fluid/operators/ngraph/ops/softmax_op.h
@@ -27,12 +27,7 @@ namespace paddle {
 namespace operators {
 namespace ngraphs {

-void BuildSoftmaxNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+std::shared_ptr<ngraph::Node> GetSoftmax(std::shared_ptr<ngraph::Node> x) {
  auto x_shape = x->get_shape();
  int rank = x_shape.size();
  auto x_2d_shape = paddle::platform::FlattenTo2d(x_shape, rank - 1);
@@ -47,16 +42,11 @@ void BuildSoftmaxNode(
          -64., x_shifted);
  auto softmax =
      std::make_shared<ngraph::op::Softmax>(x_clipped, ngraph::AxisSet{1});
-  paddle::platform::SetOutputNode(op, "Out", softmax, ngb_node_map);
+  return softmax;
 }

-void BuildSoftmaxGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map);
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
+std::shared_ptr<ngraph::Node> GetSoftmaxGrad(
+    std::shared_ptr<ngraph::Node> out, std::shared_ptr<ngraph::Node> dout) {
  auto out_shape = out->get_shape();
  int rank = out_shape.size();
  auto out_2d_shape = paddle::platform::FlattenTo2d(out_shape, rank - 1);
@@ -70,6 +60,27 @@ void BuildSoftmaxGradNode(
  auto node_bcast = std::make_shared<ngraph::op::Broadcast>(
      node_sum, out_2d_shape, ngraph::AxisSet{1});
  auto dx = (dout - node_bcast) * out;
+  return dx;
+}
+
+void BuildSoftmaxNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {  // wrapper: fetch "X", delegate to GetSoftmax, store "Out"
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto softmax = GetSoftmax(x);
+  paddle::platform::SetOutputNode(op, "Out", softmax, ngb_node_map);
+}
+
+void BuildSoftmaxGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {  // wrapper: fetch "Out"/"Out@GRAD", store "X@GRAD"
+  auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map);
+  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
+  auto dx = GetSoftmaxGrad(out, dout);  // gradient nodes are built in GetSoftmaxGrad
  paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
 }
 }  // namespace ngraphs

--- a/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
--- a/paddle/fluid/operators/save_combine_op.cu
+++ b/paddle/fluid/operators/save_combine_op.cu
--- a/paddle/fluid/operators/save_combine_op.h
+++ b/paddle/fluid/operators/save_combine_op.h
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
--- a/paddle/fluid/operators/save_load_op_test.cc
+++ b/paddle/fluid/operators/save_load_op_test.cc
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
--- a/paddle/fluid/operators/save_op.cu
+++ b/paddle/fluid/operators/save_op.cu
--- a/paddle/fluid/operators/save_op.h
+++ b/paddle/fluid/operators/save_op.h
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -23,6 +23,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/cuda_helper.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
+#if !defined(__APPLE__) && !defined(_WIN32)
+#include "paddle/fluid/platform/dynload/nccl.h"
+#endif
 #include "paddle/fluid/platform/gpu_info.h"
 #endif


--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/imperative/__init__.py
+++ b/python/paddle/fluid/imperative/__init__.py
--- a/python/paddle/fluid/imperative/checkpoint.py
+++ b/python/paddle/fluid/imperative/checkpoint.py
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
--- a/python/paddle/fluid/layers/__init__.py
+++ b/python/paddle/fluid/layers/__init__.py
--- a/python/paddle/fluid/layers/collective.py
+++ b/python/paddle/fluid/layers/collective.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
+++ b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
--- a/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py