support the fusion of batch_norm and relu for AMP. test=release/1.7 (#22210)

c63a63d5 · Zhen Wang · Zeng Jinle · fedb609d · c63a63d5 · c63a63d5
17 changed file
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -118,7 +118,7 @@ function(op_library TARGET)
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
 "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
 "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-"multihead_matmul_op" "fusion_group_op")
+"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op")
        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
            set(pybind_flag 1)
        endif()

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -100,7 +100,7 @@ endif()
 cc_library(build_strategy SRCS build_strategy.cc DEPS
        graph_viz_pass multi_devices_graph_pass
        multi_devices_graph_print_pass multi_devices_graph_check_pass
-        fuse_elewise_add_act_pass multi_batch_merge_pass 
+        fuse_elewise_add_act_pass fuse_bn_act_pass multi_batch_merge_pass
        fuse_relu_depthwise_conv_pass
        lock_free_optimize_pass
        coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass

--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -167,6 +167,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
                        "fuse_relu_depthwise_conv_pass");
    AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_,
                        "fuse_elewise_add_act_pass");
+    AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass");
    // for single card training, fuse_all_reduce_ops is unnecessary.
    // coalesce_grad_tensor_pass should be before of MultiDevPass.
    AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
@@ -369,6 +370,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                        "GPU, skipped.";
        continue;
      }
+    } else if (pass->Type() == "fuse_bn_act_pass") {
+      if (!use_cuda) {
+        LOG(WARNING) << "fuse_bn_act_pass is only supported on "
+                        "GPU, skipped.";
+        continue;
+      }
    } else if (pass->Type() == "mkldnn_placement_pass") {
      pass->Set("mkldnn_enabled_op_types",
                new std::unordered_set<std::string>(mkldnn_enabled_op_types_));
@@ -394,6 +401,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
 USE_PASS(sync_batch_norm_pass);
 USE_PASS(fuse_relu_depthwise_conv_pass);
 USE_PASS(fuse_elewise_add_act_pass);
+USE_PASS(fuse_bn_act_pass);
 USE_PASS(graph_viz_pass);
 USE_PASS(multi_batch_merge_pass);
 USE_PASS(reduce_mode_multi_devices_pass);

--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -87,6 +87,7 @@ struct BuildStrategy {
  // TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have
  // cycle.
  bool fuse_elewise_add_act_ops_{false};
+  bool fuse_bn_act_ops_{false};
  // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
  // should not be sparse types
  boost::optional<bool> fuse_all_optimizer_ops_{false};

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -106,6 +106,7 @@ if(WITH_NGRAPH)
    set(INFER_IR_PASSES ${INFER_IR_PASSES} ngraph_subgraph_pass CACHE INTERNAL "")
 endif()

+cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector )
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
 cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector )


--- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/fuse_bn_act_pass.h"
+#include <algorithm>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void FuseBatchNormActPass::ApplyImpl(ir::Graph *graph) const {
+#ifdef PADDLE_WITH_CUDA
+#if CUDNN_VERSION_MIN(7, 4, 1)
+  // forward
+  std::unordered_set<std::string> act_types = {"relu"};
+  graph = FuseBatchNormAct(graph, act_types);
+  // backward
+  std::unordered_set<std::string> act_grad_types = {"relu_grad"};
+  graph = FuseBatchNormActGrad(graph, act_grad_types);
+#endif
+#endif
+}
+
+// act(bn(x))
+ir::Graph *FuseBatchNormActPass::FuseBatchNormAct(
+    ir::Graph *graph, const std::unordered_set<std::string> &act_types) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument(
+                 "The input graph of FuseBatchNormAct should not be nullptr."));
+  FusePassBase::Init("bn_act", graph);
+
+  GraphPatternDetector gpd;
+  auto *x = gpd.mutable_pattern()
+                ->NewNode("bn_act/x")
+                ->AsInput()
+                ->assert_is_op_input("batch_norm", "X")
+                ->assert_var_dtype(proto::VarType::FP16);
+  patterns::BatchNormAct bn_act_pattern(gpd.mutable_pattern(), "bn_act");
+
+  bn_act_pattern(x, act_types);
+
+  int found_bn_act_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle FuseBatchNormAct fuse";
+    // BN inputs
+    GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, bn_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, bn_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_variance, bn_variance, bn_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_mean, bn_mean, bn_act_pattern);
+    // BN outputs
+    GET_IR_NODE_FROM_SUBGRAPH(bn_mean_out, bn_mean_out, bn_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_variance_out, bn_variance_out, bn_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance,
+                              bn_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, bn_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_reserve_space, bn_reserve_space,
+                              bn_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_out, bn_out, bn_act_pattern);
+    // ACT output
+    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, bn_act_pattern);
+    // ops
+    GET_IR_NODE_FROM_SUBGRAPH(batch_norm, batch_norm, bn_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act, act, bn_act_pattern);
+
+    std::string bn_x_n = subgraph.at(x)->Name();
+    std::string bn_scale_n = bn_scale->Name();
+    std::string bn_bias_n = bn_bias->Name();
+    std::string bn_variance_n = bn_variance->Name();
+    std::string bn_mean_n = bn_mean->Name();
+    std::string bn_mean_out_n = bn_mean_out->Name();
+    std::string bn_variance_out_n = bn_variance_out->Name();
+    std::string bn_saved_variance_n = bn_saved_variance->Name();
+    std::string bn_saved_mean_n = bn_saved_mean->Name();
+    std::string bn_reserve_space_n = bn_reserve_space->Name();
+    std::string bn_out_n = bn_out->Name();
+    std::string act_out_n = act_out->Name();
+
+    Node *fused_bn_act_node = CreateFusedBatchNormActNode(
+        g, act, batch_norm, bn_x_n, bn_scale_n, bn_bias_n, bn_variance_n,
+        bn_mean_n, bn_mean_out_n, bn_variance_out_n, bn_saved_variance_n,
+        bn_saved_mean_n, bn_reserve_space_n, act_out_n);
+
+    VLOG(4) << "\n\t " << bn_x_n << ", " << bn_scale_n << ", " << bn_bias_n
+            << ", " << bn_variance_n << " and " << bn_mean_n << " -> "
+            << batch_norm->Name() << " -> " << bn_mean_out_n << ", "
+            << bn_variance_out_n << ", " << bn_saved_variance_n << ", "
+            << bn_saved_mean_n << ", " << bn_reserve_space_n << " and "
+            << bn_out_n << "\n"
+            << "\t " << bn_out_n << " -> " << act->Name() << " -> "
+            << act_out_n;
+
+    ReLinkNodes(g, bn_out, batch_norm, act, fused_bn_act_node);
+    found_bn_act_count++;
+  };
+
+  gpd(graph, handler);
+
+  AddStatis(found_bn_act_count);
+  return graph;
+}
+
+Node *FuseBatchNormActPass::CreateFusedBatchNormActNode(
+    Graph *g, const Node *act, const Node *bn, const std::string &bn_x_n,
+    const std::string &bn_scale_n, const std::string &bn_bias_n,
+    const std::string &bn_variance_n, const std::string &bn_mean_n,
+    const std::string &bn_mean_out_n, const std::string &bn_variance_out_n,
+    const std::string &bn_saved_variance_n, const std::string &bn_saved_mean_n,
+    const std::string &bn_reserve_space_n, const std::string &act_out_n) const {
+  OpDesc desc;
+  desc.SetInput("X", std::vector<std::string>({bn_x_n}));
+  desc.SetInput("Scale", std::vector<std::string>({bn_scale_n}));
+  desc.SetInput("Bias", std::vector<std::string>({bn_bias_n}));
+  desc.SetInput("Mean", std::vector<std::string>({bn_mean_n}));
+  desc.SetInput("Variance", std::vector<std::string>({bn_variance_n}));
+
+  desc.SetOutput("Y", std::vector<std::string>({act_out_n}));
+  desc.SetOutput("MeanOut", std::vector<std::string>({bn_mean_out_n}));
+  desc.SetOutput("VarianceOut", std::vector<std::string>({bn_variance_out_n}));
+  desc.SetOutput("SavedMean", std::vector<std::string>({bn_saved_mean_n}));
+  desc.SetOutput("SavedVariance",
+                 std::vector<std::string>({bn_saved_variance_n}));
+  desc.SetOutput("ReserveSpace",
+                 std::vector<std::string>({bn_reserve_space_n}));
+  desc.SetType("fused_batch_norm_act");
+
+  desc.SetAttr("act_type", act->Name());
+  // Set attrs
+  for (auto &n : {act->Op(), bn->Op()}) {
+    for (auto &m : n->GetAttrMap()) {
+      desc.SetAttr(m.first, m.second);
+    }
+  }
+
+  auto fused_bn_act_node = g->CreateOpNode(&desc);
+  return fused_bn_act_node;
+}
+
+// the backward of act(bn(x))
+// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
+// bn_grad: in["X", "Y@GRAD", "Scale", "Bias", "SavedMean", "SavedVariance",
+// "ReserveSpace"],
+// out["X@GRAD", "Scale@GRAD", "Bias@GRAD"]
+ir::Graph *FuseBatchNormActPass::FuseBatchNormActGrad(
+    ir::Graph *graph,
+    const std::unordered_set<std::string> &act_grad_types) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph,
+      platform::errors::InvalidArgument(
+          "The input graph of FuseBatchNormActGrad should not be nullptr."));
+  FusePassBase::Init("bn_act_grad", graph);
+
+  GraphPatternDetector gpd;
+  auto *d_act_out =
+      gpd.mutable_pattern()
+          ->NewNode("bn_act_grad/x")
+          ->AsInput()
+          ->assert_is_ops_input(act_grad_types, GradVarName("Out"));
+  patterns::BatchNormActGrad bn_act_grad_pattern(gpd.mutable_pattern(),
+                                                 "bn_act_grad");
+  bn_act_grad_pattern(d_act_out, act_grad_types);
+
+  int found_bn_act_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle FuseBatchNormActGrad fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(batch_norm_grad, batch_norm_grad,
+                              bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out,
+                              bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_x, bn_x, bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean,
+                              bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance,
+                              bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_reserve_space, bn_reserve_space,
+                              bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(d_bn_x, d_bn_x, bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(d_bn_scale, d_bn_scale, bn_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(d_bn_bias, d_bn_bias, bn_act_grad_pattern);
+
+    std::string d_act_out_n = subgraph.at(d_act_out)->Name();  // Y@GRAD
+    std::string act_out_n = act_out->Name();                   // Y
+    std::string d_itermediate_out_n = d_itermediate_out->Name();
+    std::string bn_x_n = bn_x->Name();
+    std::string bn_scale_n = bn_scale->Name();
+    std::string bn_bias_n = bn_bias->Name();
+    std::string bn_saved_mean_n = bn_saved_mean->Name();
+    std::string bn_saved_variance_n = bn_saved_variance->Name();
+    std::string bn_reserve_space_n = bn_reserve_space->Name();
+    std::string d_bn_x_n = d_bn_x->Name();
+    std::string d_bn_scale_n = d_bn_scale->Name();
+    std::string d_bn_bias_n = d_bn_bias->Name();
+
+    OpDesc desc;
+    desc.SetType("fused_batch_norm_act_grad");
+    desc.SetInput("X", {bn_x_n});
+    desc.SetInput("Y", std::vector<std::string>({act_out_n}));
+    desc.SetInput(GradVarName("Y"), std::vector<std::string>({d_act_out_n}));
+    desc.SetInput("Scale", std::vector<std::string>({bn_scale_n}));
+    desc.SetInput("Bias", std::vector<std::string>({bn_bias_n}));
+    desc.SetInput("SavedMean", std::vector<std::string>({bn_saved_mean_n}));
+    desc.SetInput("SavedVariance",
+                  std::vector<std::string>({bn_saved_variance_n}));
+    desc.SetInput("ReserveSpace",
+                  std::vector<std::string>({bn_reserve_space_n}));
+    desc.SetOutput(GradVarName("X"), std::vector<std::string>({d_bn_x_n}));
+    desc.SetOutput(GradVarName("Scale"),
+                   std::vector<std::string>({d_bn_scale_n}));
+    desc.SetOutput(GradVarName("Bias"),
+                   std::vector<std::string>({d_bn_bias_n}));
+    std::string act = act_grad->Name();
+    act = act.substr(0, act.length() - 5);  // remove "_grad"
+    desc.SetAttr("act_type", act);
+
+    for (auto &n : {act_grad->Op(), batch_norm_grad->Op()}) {
+      for (auto &m : n->GetAttrMap()) {
+        desc.SetAttr(m.first, m.second);
+      }
+    }
+
+    auto fused_node = g->CreateOpNode(&desc);
+
+    VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
+            << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
+            << bn_x_n << ", " << d_itermediate_out_n << ", " << bn_scale_n
+            << ", " << bn_bias_n << ", " << bn_saved_mean_n << ", "
+            << bn_saved_variance_n << " and " << bn_reserve_space_n << " -> "
+            << batch_norm_grad->Name() << " -> " << d_bn_x_n << ", "
+            << d_bn_scale_n << " and " << d_bn_bias_n;
+
+    ReLinkNodes(g, d_itermediate_out, act_grad, batch_norm_grad, fused_node);
+    found_bn_act_count++;
+  };
+
+  gpd(graph, handler);
+
+  AddStatis(found_bn_act_count);
+  return graph;
+}
+
+void FuseBatchNormActPass::ReLinkNodes(Graph *graph,
+                                       const Node *intermediate_out, Node *op_1,
+                                       Node *op_2,
+                                       Node *fused_op) const {  // delete act
+  for (auto &in : op_1->inputs) {
+    fused_op->inputs.emplace_back(in);
+    in->outputs = this->ReplaceNode(op_1, fused_op, in->outputs);
+  }
+
+  std::unordered_set<const Node *> nodes2delete;
+  for (auto &out : op_1->outputs) {
+    // intermediate_out or ctr_var
+    auto result_iter =
+        std::find_if(op_2->inputs.begin(), op_2->inputs.end(),
+                     [&out](const Node *node) -> bool { return node == out; });
+
+    if (result_iter == op_2->inputs.end()) {
+      IR_OP_VAR_LINK(fused_op, out);
+    } else {
+      nodes2delete.emplace(out);
+    }
+  }
+
+  for (auto &in : op_2->inputs) {
+    if (in == intermediate_out || nodes2delete.count(in)) {
+      continue;
+    }
+    fused_op->inputs.emplace_back(in);
+    in->outputs = this->ReplaceNode(op_2, fused_op, in->outputs);
+  }
+
+  for (auto &out : op_2->outputs) {
+    IR_OP_VAR_LINK(fused_op, out);
+  }
+
+  nodes2delete.insert(std::move(op_1));
+  nodes2delete.insert(std::move(op_2));
+
+  GraphSafeRemoveNodes(graph, nodes2delete);
+}
+
+std::vector<Node *> FuseBatchNormActPass::ReplaceNode(
+    Node *cur_node, Node *new_node, const std::vector<Node *> &nodes) const {
+  std::vector<Node *> new_list(nodes.size());
+  bool has_replaced = false;
+  std::transform(nodes.begin(), nodes.end(), new_list.begin(),
+                 [&](Node *node) -> Node * {
+                   if (node == cur_node) {
+                     has_replaced = true;
+                     return new_node;
+                   }
+                   return node;
+                 });
+  PADDLE_ENFORCE_EQ(has_replaced, true,
+                    platform::errors::NotFound("Not find %s in the node list.",
+                                               cur_node->Name()));
+  return new_list;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fuse_bn_act_pass, paddle::framework::ir::FuseBatchNormActPass);
--- a/paddle/fluid/framework/ir/fuse_bn_act_pass.h
+++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fuse the BatchNorm and activation.
+ */
+class FuseBatchNormActPass : public FusePassBase {
+ public:
+  virtual ~FuseBatchNormActPass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph *graph) const override;
+
+  ir::Graph *FuseBatchNormAct(
+      ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;
+
+  ir::Graph *FuseBatchNormActGrad(
+      ir::Graph *graph,
+      const std::unordered_set<std::string> &act_grad_types) const;
+
+  std::vector<Node *> ReplaceNode(Node *cur_node, Node *new_node,
+                                  const std::vector<Node *> &nodes) const;
+
+  void ReLinkNodes(Graph *graph, const Node *intermediate_out, Node *op_1,
+                   Node *op_2, Node *fused_op) const;
+  Node *CreateFusedBatchNormActNode(
+      Graph *g, const Node *act, const Node *bn, const std::string &bn_x_n,
+      const std::string &bn_scale_n, const std::string &bn_bias_n,
+      const std::string &bn_variance_n, const std::string &bn_mean_n,
+      const std::string &bn_mean_out_n, const std::string &bn_variance_out_n,
+      const std::string &bn_saved_variance_n,
+      const std::string &bn_saved_mean_n, const std::string &bn_reserve_space_n,
+      const std::string &act_out_n) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -383,6 +383,13 @@ PDNode *PDNode::assert_is_var() {
  return this;
 }

+PDNode *PDNode::assert_var_dtype(proto::VarType::Type dtype) {
+  assert_is_var();
+  asserts_.emplace_back(
+      [dtype](Node *x) { return x->Var()->GetDataType() == dtype; });
+  return this;
+}
+
 PDNode *PDNode::assert_is_not_ctrl_var() {
  asserts_.emplace_back([](Node *x) { return x && !x->IsCtrlVar(); });
  return this;
@@ -476,6 +483,7 @@ PDNode *PDNode::assert_is_op_output(const std::string &op_type,
  assert_is_op_nth_output(op_type, argument, 0);
  return this;
 }
+
 PDNode *PDNode::assert_is_op_input(const std::string &op_type) {
  assert_is_var();
  asserts_.emplace_back([=](Node *x) {
@@ -489,6 +497,16 @@ PDNode *PDNode::assert_is_op_input(const std::string &op_type) {
  return this;
 }

+PDNode *PDNode::assert_is_not_op_input(const std::string &argument) {
+  assert_is_op();
+  asserts_.emplace_back([=](Node *x) {
+    auto &ins = x->Op()->Inputs();
+    auto iter = ins.find(argument);
+    return iter == ins.end() || iter->second.empty();
+  });
+  return this;
+}
+
 PDNode *PDNode::assert_is_op_input(const std::string &op_type,
                                   const std::string &argument) {
  assert_is_var();
@@ -1048,6 +1066,117 @@ PDNode *patterns::ActElewiseAdd::operator()(
  return elewise_add_out;
 }

+PDNode *patterns::BatchNormAct::operator()(
+    paddle::framework::ir::PDNode *bn_x_var,
+    std::unordered_set<std::string> act_types) {
+  auto *bn_scale_var = pattern->NewNode(bn_scale_repr())
+                           ->assert_is_op_input("batch_norm", "Scale");
+  auto *bn_bias_var = pattern->NewNode(bn_bias_repr())
+                          ->assert_is_op_input("batch_norm", "Bias");
+  auto *bn_variance_var = pattern->NewNode(bn_variance_repr())
+                              ->assert_is_op_input("batch_norm", "Variance");
+  auto *bn_mean_var = pattern->NewNode(bn_mean_repr())
+                          ->assert_is_op_input("batch_norm", "Mean");
+
+  auto *bn = pattern->NewNode(batch_norm_repr())
+                 ->assert_is_op("batch_norm")
+                 ->assert_is_not_op_input("MomentumTensor")
+                 ->assert_op_attr<bool>("is_test", false)
+                 ->assert_op_attr<bool>("use_global_stats", false)
+                 ->assert_op_attr<std::string>("data_layout", "NHWC");
+
+  auto *bn_mean_out_var = pattern->NewNode(bn_mean_out_repr())
+                              ->assert_is_op_output("batch_norm", "MeanOut");
+  auto *bn_variance_out_var =
+      pattern->NewNode(bn_variance_out_repr())
+          ->assert_is_op_output("batch_norm", "VarianceOut");
+  auto *bn_saved_variance_var =
+      pattern->NewNode(bn_saved_variance_repr())
+          ->assert_is_op_output("batch_norm", "SavedVariance");
+  auto *bn_saved_mean_var =
+      pattern->NewNode(bn_saved_mean_repr())
+          ->assert_is_op_output("batch_norm", "SavedMean");
+  auto *bn_reserve_space =
+      pattern->NewNode(bn_reserve_space_repr())
+          ->assert_is_op_output("batch_norm", "ReserveSpace");
+  auto *bn_out_var = pattern->NewNode(bn_out_repr())
+                         ->assert_is_op_output("batch_norm", "Y")
+                         ->assert_has_n_outputs(1);
+
+  bn_out_var->AsIntermediate()->assert_is_ops_input(act_types);
+
+  auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types);
+
+  auto *act_out_var =
+      pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out");
+
+  bn->LinksFrom(
+        {bn_x_var, bn_scale_var, bn_bias_var, bn_variance_var, bn_mean_var})
+      .LinksTo({bn_mean_out_var, bn_variance_out_var, bn_saved_variance_var,
+                bn_saved_mean_var, bn_reserve_space, bn_out_var});
+  act->LinksFrom({bn_out_var}).LinksTo({act_out_var});
+
+  return act_out_var;
+}
+
+PDNode *patterns::BatchNormActGrad::operator()(
+    paddle::framework::ir::PDNode *d_act_out_var,
+    std::unordered_set<std::string> act_grad_types) {
+  auto *act_grad =
+      pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types);
+  auto *bn_grad = pattern->NewNode(batch_norm_grad_repr())
+                      ->assert_is_op("batch_norm_grad")
+                      ->assert_op_attr<bool>("use_global_stats", false)
+                      ->assert_op_attr<std::string>("data_layout", "NHWC");
+
+  auto *act_out_var = pattern->NewNode(act_out_repr())
+                          ->assert_is_ops_input(act_grad_types, "Out");
+  auto *d_intermediate_var =
+      pattern->NewNode(d_itermediate_out_repr())
+          ->assert_is_ops_output(act_grad_types, GradVarName("X"))
+          ->assert_has_n_outputs(1);
+  auto *bn_x_var = pattern->NewNode(bn_x_repr())
+                       ->assert_is_op_input("batch_norm_grad", "X")
+                       ->assert_var_dtype(proto::VarType::FP16);
+  auto *bn_scale_var = pattern->NewNode(bn_scale_repr())
+                           ->assert_is_op_input("batch_norm_grad", "Scale");
+  auto *bn_bias_var = pattern->NewNode(bn_bias_repr())
+                          ->assert_is_op_input("batch_norm_grad", "Bias");
+  auto *bn_saved_mean_var =
+      pattern->NewNode(bn_saved_mean_repr())
+          ->assert_is_op_input("batch_norm_grad", "SavedMean");
+  auto *bn_saved_variance_var =
+      pattern->NewNode(bn_saved_variance_repr())
+          ->assert_is_op_input("batch_norm_grad", "SavedVariance");
+  // ReserveSpace as the output is equal to:
+  // data_layout == 'NHWC' && FLAGS_cudnn_batchnorm_spatial_persistent == true
+  auto *bn_reserve_space =
+      pattern->NewNode(bn_reserve_space_repr())
+          ->assert_is_op_input("batch_norm_grad", "ReserveSpace");
+  auto *d_bn_x_var =
+      pattern->NewNode(d_bn_x_repr())
+          ->assert_is_not_ctrl_var()
+          ->assert_is_op_output("batch_norm_grad", GradVarName("X"));
+  auto *d_bn_scale_var =
+      pattern->NewNode(d_bn_scale_repr())
+          ->assert_is_not_ctrl_var()
+          ->assert_is_op_output("batch_norm_grad", GradVarName("Scale"));
+  auto *d_bn_bias_var =
+      pattern->NewNode(d_bn_bias_repr())
+          ->assert_is_not_ctrl_var()
+          ->assert_is_op_output("batch_norm_grad", GradVarName("Bias"));
+
+  act_grad->LinksFrom({d_act_out_var, act_out_var})
+      .LinksTo({d_intermediate_var});
+
+  bn_grad
+      ->LinksFrom({bn_x_var, d_intermediate_var, bn_scale_var, bn_bias_var,
+                   bn_saved_mean_var, bn_saved_variance_var, bn_reserve_space})
+      .LinksTo({d_bn_x_var, d_bn_scale_var, d_bn_bias_var});
+
+  return bn_grad;
+}
+
 PDNode *patterns::ElewiseAddAct::operator()(
    paddle::framework::ir::PDNode *ele_x_var,
    std::unordered_set<std::string> act_types) {

--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -27,6 +27,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/inference/analysis/dot.h"
@@ -100,6 +101,7 @@ struct PDNode {
  PDNode* assert_is_op();
  PDNode* assert_is_op(const std::string& op_type);
  PDNode* assert_is_var();
+  PDNode* assert_var_dtype(proto::VarType::Type dtype);
  PDNode* assert_is_not_ctrl_var();
  PDNode* assert_var_not_persistable();
  PDNode* assert_is_persistable_var();
@@ -111,6 +113,7 @@ struct PDNode {
                             const std::string& argument);
  PDNode* assert_is_op_nth_input(const std::string& op_type,
                                 const std::string& argument, int nth);
+  PDNode* assert_is_not_op_input(const std::string& argument);
  PDNode* assert_is_op_nth_output(const std::string& op_type,
                                  const std::string& argument, int nth);
  PDNode* assert_is_only_input_of_op(const std::string& op_type);
@@ -590,6 +593,64 @@ struct GRU : public PatternBase {
  PATTERN_DECL_NODE(Hidden);
 };

+// The following pattern is used to fuse batch_norm and act
+// formula: act(bn(x))
+// op: batch_norm + act
+struct BatchNormAct : public PatternBase {
+  BatchNormAct(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "bn_act") {}
+
+  PDNode* operator()(PDNode* x, std::unordered_set<std::string> acts);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(batch_norm);
+  PATTERN_DECL_NODE(act);
+  // declare variable node's name
+  // BN inputs
+  PATTERN_DECL_NODE(bn_scale);
+  PATTERN_DECL_NODE(bn_bias);
+  PATTERN_DECL_NODE(bn_variance);
+  PATTERN_DECL_NODE(bn_mean);
+  // BN outputs
+  PATTERN_DECL_NODE(bn_mean_out);
+  PATTERN_DECL_NODE(bn_variance_out);
+  PATTERN_DECL_NODE(bn_saved_variance);
+  PATTERN_DECL_NODE(bn_saved_mean);
+  PATTERN_DECL_NODE(bn_reserve_space);
+  PATTERN_DECL_NODE(bn_out);
+  // ACT output
+  PATTERN_DECL_NODE(act_out);
+};
+
+// the backward of act(bn(x))
+// op: batch_norm_grad + act_grad
+struct BatchNormActGrad : public PatternBase {
+  BatchNormActGrad(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "bn_act_grad") {}
+
+  // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
+  // bn_grad: in["X", "Y@GRAD", "Scale", "Bias", "SavedMean", "SavedVariance",
+  // "ReserveSpace"],
+  // out["X@GRAD", "Scale@GRAD", "Bias@GRAD"]
+  PDNode* operator()(PDNode* x, std::unordered_set<std::string> act_grad_types);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(act_grad);
+  PATTERN_DECL_NODE(batch_norm_grad);
+  // declare variable node's name
+  PATTERN_DECL_NODE(act_out);
+  PATTERN_DECL_NODE(d_itermediate_out);
+  PATTERN_DECL_NODE(bn_x);
+  PATTERN_DECL_NODE(bn_scale);
+  PATTERN_DECL_NODE(bn_bias);
+  PATTERN_DECL_NODE(bn_saved_mean);
+  PATTERN_DECL_NODE(bn_saved_variance);
+  PATTERN_DECL_NODE(bn_reserve_space);
+  PATTERN_DECL_NODE(d_bn_x);
+  PATTERN_DECL_NODE(d_bn_scale);
+  PATTERN_DECL_NODE(d_bn_bias);
+};
+
 // The following patterns are used to fuse elewise_add and act
 // formula: act(ele_add(x, y))
 // op: elementwise_add + act

--- a/paddle/fluid/framework/unused_var_check.cc
+++ b/paddle/fluid/framework/unused_var_check.cc
@@ -30,6 +30,8 @@ const std::unordered_set<std::string> op_has_unsed_vars_white_list = {
    "auc",
    "batch_norm",
    "batch_norm_grad",
+    "fused_batch_norm_act",
+    "fused_batch_norm_act_grad",
    "sync_batch_norm_grad",
    "center_loss_grad",
    "crop",

--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
 include(operators)
 register_operators(EXCLUDES
+    fused_bn_activation_op
    conv_fusion_op
    fusion_transpose_flatten_concat_op
    fusion_conv_inception_op
@@ -8,6 +9,11 @@ register_operators(EXCLUDES
    fusion_group_op)

 if (WITH_GPU)
+    # fused_bn_activation_op needs cudnn 7.4.1 above
+    if (NOT ${CUDNN_VERSION} VERSION_LESS 7401)
+        op_library(fused_bn_activation_op)
+        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_batch_norm_act);\n")
+    endif()
    # conv_fusion_op needs cudnn 7 above
    if (NOT ${CUDNN_VERSION} VERSION_LESS 7100)
        op_library(conv_fusion_op)

--- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc
+++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fused_bn_activation_op.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+void FusedBatchNormActOp::InferShape(framework::InferShapeContext *ctx) const {
+  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                    platform::errors::InvalidArgument(
+                        "Input(X) of BatchNormOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true,
+                    platform::errors::InvalidArgument(
+                        "Input(Scale) of BatchNormOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Bias"), true,
+                    platform::errors::InvalidArgument(
+                        "Input(Bias) of BatchNormOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Mean"), true,
+                    platform::errors::InvalidArgument(
+                        "Input(Mean) of BatchNormOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Variance"), true,
+                    platform::errors::InvalidArgument(
+                        "Input(Variance) of BatchNormOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true,
+                    platform::errors::InvalidArgument(
+                        "Output(Y) of BatchNormOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("MeanOut"), true,
+                    platform::errors::InvalidArgument(
+                        "Output(MeanOut) of BatchNormOp should not be null."));
+  PADDLE_ENFORCE_EQ(
+      ctx->HasOutput("VarianceOut"), true,
+      platform::errors::InvalidArgument(
+          "Output(VarianceOut) of BatchNormOp should not be null."));
+  PADDLE_ENFORCE_EQ(
+      ctx->HasOutput("SavedMean"), true,
+      platform::errors::InvalidArgument(
+          "Output(SavedMean) of BatchNormOp should not be null."));
+  PADDLE_ENFORCE_EQ(
+      ctx->HasOutput("SavedVariance"), true,
+      platform::errors::InvalidArgument(
+          "Output(SavedVariance) of BatchNormOp should not be null."));
+
+  // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
+  PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
+                    platform::errors::PreconditionNotMet(
+                        "Mean and MeanOut should share the same memory"));
+  PADDLE_ENFORCE_EQ(
+      ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0],
+      platform::errors::PreconditionNotMet(
+          "Variance and VarianceOut should share the same memory"));
+
+  const auto x_dims = ctx->GetInputDim("X");
+
+  PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::PreconditionNotMet(
+                                          "ShapeError: the dimension of input "
+                                          "X must greater than or equal to 2."
+                                          "But received: the shape of input X "
+                                          "= [%s], the dimension of input X ="
+                                          "[%d]",
+                                          x_dims, x_dims.size()));
+  PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::PreconditionNotMet(
+                                          "ShapeError: the dimension of input "
+                                          "X must smaller than or equal to 5."
+                                          "But received: the shape of input X "
+                                          "= [%s], the dimension of input X ="
+                                          "[%d]",
+                                          x_dims, x_dims.size()));
+
+  const int64_t C = x_dims[x_dims.size() - 1];
+
+  auto scale_dim = ctx->GetInputDim("Scale");
+  auto bias_dim = ctx->GetInputDim("Bias");
+
+  PADDLE_ENFORCE_EQ(
+      scale_dim.size(), 1UL,
+      platform::errors::PreconditionNotMet(
+          "ShapeError: the dimension of scale must equal to 1."
+          "But received: the shape of scale is [%s], the dimension "
+          "of scale is [%d]",
+          scale_dim, scale_dim.size()));
+  PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL,
+                    platform::errors::PreconditionNotMet(
+                        "ShapeError: the dimension of bias must equal to 1."
+                        "But received: the shape of bias is [%s],the dimension "
+                        "of bias is [%d]",
+                        bias_dim, bias_dim.size()));
+
+  bool check = true;
+  if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
+                              framework::product(bias_dim) <= 0)) {
+    check = false;
+  }
+
+  if (check) {
+    PADDLE_ENFORCE_EQ(scale_dim[0], C,
+                      platform::errors::PreconditionNotMet(
+                          "ShapeError: the shape of scale must equal to [%d]"
+                          "But received: the shape of scale is [%d]",
+                          C, scale_dim[0]));
+    PADDLE_ENFORCE_EQ(bias_dim[0], C,
+                      platform::errors::PreconditionNotMet(
+                          "ShapeError: the shape of bias must equal to [%d]"
+                          "But received: the shape of bias is [%d]",
+                          C, bias_dim[0]));
+  }
+  ctx->SetOutputDim("Y", x_dims);
+  ctx->SetOutputDim("MeanOut", {C});
+  ctx->SetOutputDim("VarianceOut", {C});
+  ctx->SetOutputDim("SavedMean", {C});
+  ctx->SetOutputDim("SavedVariance", {C});
+  ctx->ShareLoD("X", "Y");
+}
+
+framework::OpKernelType FusedBatchNormActOp::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
+  auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+  // By default, the type of the scale, bias, mean,
+  // and var tensors should both be float. (For float or float16 input tensor)
+  // or double (For double input tensor).
+  auto bn_param_type = framework::proto::VarType::FP32;
+  if (input_data_type == framework::proto::VarType::FP64) {
+    bn_param_type = framework::proto::VarType::FP64;
+  }
+  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(),
+                    platform::errors::PreconditionNotMet(
+                        "Scale input should be of float type"));
+  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(),
+                    platform::errors::PreconditionNotMet(
+                        "Bias input should be of float type"));
+  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(),
+                    platform::errors::PreconditionNotMet(
+                        "Mean input should be of float type"));
+  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(),
+                    platform::errors::PreconditionNotMet(
+                        "Variance input should be of float type"));
+
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
+  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                 library);
+}
+
+framework::OpKernelType FusedBatchNormActOp::GetKernelTypeForVar(
+    const std::string &var_name, const Tensor &tensor,
+    const framework::OpKernelType &expected_kernel_type) const {
+  return framework::OpKernelType(expected_kernel_type.data_type_,
+                                 tensor.place(), tensor.layout());
+}
+
+void FusedBatchNormActOpMaker::Make() {
+  AddAttr<float>("momentum", "").SetDefault(0.9);
+  AddAttr<float>("epsilon", "")
+      .SetDefault(1e-5)
+      .AddCustomChecker([](const float &epsilon) {
+        PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true,
+                          platform::errors::InvalidArgument(
+                              "'epsilon' should be between 0.0 and 0.001."));
+      });
+  AddAttr<std::string>("act_type", "The activation type to be fused.")
+      .SetDefault("relu");
+  AddInput("X", "The input tensor");
+  AddInput("Scale",
+           "Scale is a 1-dimensional tensor of size C "
+           "that is applied to the output");
+  AddInput("Bias",
+           "Bias is a 1-dimensional tensor of size C "
+           "that is applied to the output");
+  AddInput("Mean",
+           "The global mean (for training) or "
+           "estimated mean (for testing)");
+  AddInput("Variance",
+           "The global variance (for training) "
+           "or estimated Variance (for testing)");
+  AddOutput("Y", "result after normalization");
+  AddOutput("MeanOut",
+            "Share memory with Mean. "
+            "Store the global mean when training");
+  AddOutput("VarianceOut",
+            "Share memory with Variance. "
+            "Store the global Variance when training");
+  AddOutput("SavedMean",
+            "Mean of the current mini batch, "
+            "will apply to output when training")
+      .AsIntermediate();
+  AddOutput("SavedVariance",
+            "Variance of the current mini batch, "
+            "will apply to output when training")
+      .AsIntermediate();
+  AddOutput("ReserveSpace",
+            "Reserve GPU space for triggering the new semi-persistent "
+            "NHWC kernel");
+  AddComment(R"DOC(
+Fused Batch Normalization with activation.
+
+Batch Norm has been implemented as discussed in the paper:
+https://arxiv.org/pdf/1502.03167.pdf
+Batch Norm can be used as a normalizer function for conv2d and fully_connected operations.
+Now, the required data format for FusedBatchNormActOp is NHWC `[batch, in_height, in_width, in_channels]`.
+
+)DOC");
+}
+
+void FusedBatchNormActGradOp::InferShape(
+    framework::InferShapeContext *ctx) const {
+  // check input
+  PADDLE_ENFORCE_EQ(
+      ctx->HasInput("X"), true,
+      platform::errors::InvalidArgument("Input(X) should not be null."));
+  PADDLE_ENFORCE_EQ(
+      ctx->HasInput("Scale"), true,
+      platform::errors::InvalidArgument("Input(Scale) should not be null."));
+  PADDLE_ENFORCE_EQ(
+      ctx->HasInput(framework::GradVarName("Y")), true,
+      platform::errors::InvalidArgument("Input(Y@GRAD) should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("SavedMean"), true,
+                    platform::errors::InvalidArgument(
+                        "Input(SavedMean) should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("SavedVariance"), true,
+                    platform::errors::InvalidArgument(
+                        "Input(SavedVariance) should not be null"));
+
+  // check output
+  PADDLE_ENFORCE_EQ(
+      ctx->HasOutput(framework::GradVarName("X")), true,
+      platform::errors::InvalidArgument("Output(X@GRAD) should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Scale")), true,
+                    platform::errors::InvalidArgument(
+                        "Output(Scale@GRAD) should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Bias")), true,
+                    platform::errors::InvalidArgument(
+                        "Output(Bias@GRAD) should not be null."));
+
+  const auto x_dims = ctx->GetInputDim("X");
+  const int C = x_dims[x_dims.size() - 1];
+
+  ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+  // has_scale_grad == has_bias_grad, judge has_scale_grad is enough
+  ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
+  ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
+}
+
+framework::OpKernelType FusedBatchNormActGradOp::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
+  const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+  if (var == nullptr) {
+    PADDLE_THROW(platform::errors::NotFound(
+        "Can not find Y@GRAD in the execution context."));
+  }
+  const Tensor *t = nullptr;
+  if (var->IsType<Tensor>()) {
+    t = &var->Get<Tensor>();
+  } else if (var->IsType<LoDTensor>()) {
+    t = &var->Get<LoDTensor>();
+  }
+  if (t == nullptr) {
+    PADDLE_THROW(
+        platform::errors::NotFound("Can not get the tensor value of Y@GRAD."));
+  }
+
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
+  return framework::OpKernelType(
+      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout,
+      library);
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    fused_batch_norm_act, ops::FusedBatchNormActOp,
+    ops::FusedBatchNormActOpMaker, ops::FusedBatchNormActOpInferVarType,
+    ops::FusedBatchNormActGradOpMaker<paddle::framework::OpDesc>,
+    ops::FusedBatchNormActGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(fused_batch_norm_act_grad, ops::FusedBatchNormActGradOp);
--- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu
+++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cfloat>
+#include <string>
+#include <vector>
+#include "cub/cub.cuh"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/fused/fused_bn_activation_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/norm_utils.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/float16.h"
+
+DECLARE_bool(cudnn_batchnorm_spatial_persistent);
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename T>
+using CudnnDataType = platform::CudnnDataType<T>;
+template <typename T>
+using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
+
+template <typename T>
+class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
+    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
+    float momentum = ctx.Attr<float>("momentum");
+    std::string act_type = ctx.Attr<std::string>("act_type");
+
+    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+      LOG(ERROR) << "Provided epsilon is smaller than "
+                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
+                 << "CUDNN_BN_MIN_EPSILON instead.";
+    }
+    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+
+    // Get the size for each dimension.
+    // NHWC [batch_size, in_height, in_width, in_channels]
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto &x_dims = x->dims();
+    PADDLE_ENFORCE_EQ(x_dims.size() >= 2 && x_dims.size() <= 5, true,
+                      platform::errors::PreconditionNotMet(
+                          "The Input dim size should be between 2 and 5"));
+
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+
+    // Run training mode.
+    // obtain running mean and running inv var, and see if we need to
+    // initialize them.
+    auto *mean_out = ctx.Output<Tensor>("MeanOut");
+    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+    auto *y = ctx.Output<Tensor>("Y");
+    y->mutable_data<T>(ctx.GetPlace());
+
+    int N, C, H, W, D;
+    const DataLayout data_layout = DataLayout::kNHWC;
+    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    if ((N * H * W * D) == 1) {
+      // Only 1 element in normalization dimension,
+      // skip the batch norm calculation, let y = act(x).
+      auto x_v = framework::EigenVector<T>::Flatten(*x);
+      auto y_v = framework::EigenVector<T>::Flatten(*y);
+      auto &dev = *dev_ctx.eigen_device();
+      if (act_type == "relu") {
+        ReluFunctor<T>()(dev, x_v, y_v);
+      } else {
+        PADDLE_THROW(
+            platform::errors::Unimplemented("Unsupported activation type"));
+      }
+      return;
+    }
+
+    // ------------------- cudnn descriptors ---------------------
+    auto handle = dev_ctx.cudnn_handle();
+    cudnnTensorDescriptor_t data_desc_;
+    cudnnTensorDescriptor_t bn_param_desc_;
+    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnCreateTensorDescriptor(&data_desc_)."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnCreateTensorDescriptor(&bn_param_desc_)."));
+
+    VLOG(3) << "Setting descriptors.";
+    std::vector<int> dims = {N, C, H, W, D};
+    std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnSetTensorNdDescriptor(
+            data_desc_, CudnnDataType<T>::type,
+            x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
+        platform::errors::External(
+            "The error has happened when calling cudnnSetTensorNdDescriptor."));
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
+                                                         data_desc_, mode_),
+        platform::errors::External("The error has happened when calling "
+                                   "cudnnDeriveBNTensorDescriptor."));
+
+    double this_factor = 1. - momentum;
+    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
+    platform::ScopedActivationDescriptor scope_act_desc;
+    cudnnActivationDescriptor_t activation_desc_ =
+        scope_act_desc.descriptor<T>(act_type);
+    size_t workspace_size = 0;
+    size_t reserve_space_size = 0;
+    void *reserve_space_ptr = nullptr;
+    void *workspace_ptr = nullptr;
+    Tensor workspace_tensor;
+    // Create reserve space and workspace for batch norm.
+    // Create tensor for each batchnorm op, it will be used in the
+    // backward. Thus this tensor shouldn't be temp.
+    auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
+    PADDLE_ENFORCE_NOT_NULL(
+        reserve_space,
+        platform::errors::NotFound(
+            "The argument ReserveSpace of batch_norm op is not found."));
+
+    // --------------- cudnn batchnorm workspace ---------------
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::
+            cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
+                /*handle=*/handle,
+                /*mode=*/mode_,
+                /*bnOps=*/bnOps_,
+                /*xDesc=*/data_desc_,
+                /*zDesc=*/nullptr,
+                /*yDesc=*/data_desc_,
+                /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
+                /*activationDesc=*/activation_desc_,
+                /*sizeInBytes=*/&workspace_size),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize."));
+
+    // -------------- cudnn batchnorm reserve space --------------
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
+            /*handle=*/handle,
+            /*mode=*/mode_,
+            /*bnOps=*/bnOps_,
+            /*activationDesc=*/activation_desc_,
+            /*xDesc=*/data_desc_,
+            /*sizeInBytes=*/&reserve_space_size),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnGetBatchNormalizationTrainingExReserveSpaceSize."));
+
+    reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
+                                                    reserve_space_size);
+    workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
+                                                  workspace_size);
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
+            handle, mode_, bnOps_, CudnnDataType<T>::kOne(),
+            CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
+            nullptr, nullptr, data_desc_, y->template data<T>(), bn_param_desc_,
+            scale->template data<BatchNormParamType<T>>(),
+            bias->template data<BatchNormParamType<T>>(), this_factor,
+            mean_out->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            variance_out->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
+                         ctx.GetPlace()),
+            saved_variance->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
+            reserve_space_size),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnBatchNormalizationForwardTrainingEx."));
+
+    // clean when exit.
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnDestroyTensorDescriptor(data_desc_)."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnDestroyTensorDescriptor(bn_param_desc_)."));
+  }
+};
+
+template <typename T>
+class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
+    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
+    std::string act_type = ctx.Attr<std::string>("act_type");
+
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *y = ctx.Input<Tensor>("Y");
+    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+    const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");
+
+    const auto &x_dims = x->dims();
+
+    PADDLE_ENFORCE_EQ(x_dims.size() >= 2 && x_dims.size() <= 5, true,
+                      platform::errors::PreconditionNotMet(
+                          "The Input dim size should be between 2 and 5"));
+    int N, C, H, W, D;
+    const DataLayout data_layout = DataLayout::kNHWC;
+    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
+
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    d_x->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE_EQ(
+        d_scale && d_bias, true,
+        platform::errors::PreconditionNotMet(
+            "Both the scale grad and the bias grad must not be null."));
+    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL,
+                      platform::errors::PreconditionNotMet(
+                          "The scale only has one dimension."));
+    PADDLE_ENFORCE_EQ(
+        scale->dims()[0], C,
+        platform::errors::PreconditionNotMet(
+            "The size of scale is equal to the channel of Input(X)."));
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    if ((N * H * W * D) == 1) {
+      if (act_type == "relu") {
+        auto x_v = framework::EigenVector<T>::Flatten(*x);
+        auto y_v = framework::EigenVector<T>::Flatten(*y);
+        auto dx_v = framework::EigenVector<T>::Flatten(*d_x);
+        auto dy_v = framework::EigenVector<T>::Flatten(*d_y);
+        auto &dev = *dev_ctx.eigen_device();
+        ReluGradFunctor<T>()(dev, x_v, y_v, dy_v, dx_v);
+      } else {
+        PADDLE_THROW(
+            platform::errors::Unimplemented("Unsupported activation type"));
+      }
+      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
+          functor;
+      functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
+      functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
+      return;
+    }
+
+    std::vector<int> dims = {N, C, H, W, D};
+    std::vector<int> strides = {H * W * C * D, 1, W * D * C, D * C, C};
+    // ------------------- cudnn descriptors ---------------------
+    cudnnTensorDescriptor_t data_desc_;
+    cudnnTensorDescriptor_t bn_param_desc_;
+    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnCreateTensorDescriptor(&data_desc_)."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnCreateTensorDescriptor(&bn_param_desc_)."));
+    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+      LOG(ERROR) << "Provided epsilon is smaller than "
+                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
+                 << "CUDNN_BN_MIN_EPSILON instead.";
+    }
+    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnSetTensorNdDescriptor(
+            data_desc_, CudnnDataType<T>::type,
+            x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
+        platform::errors::External(
+            "The error has happened when calling cudnnSetTensorNdDescriptor."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
+                                                         data_desc_, mode_),
+        platform::errors::External("The error has happened when calling "
+                                   "cudnnDeriveBNTensorDescriptor."));
+
+    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
+    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
+    const auto *saved_mean_data =
+        saved_mean->template data<BatchNormParamType<T>>();
+    const auto *saved_var_data =
+        saved_var->template data<BatchNormParamType<T>>();
+
+    size_t workspace_size = 0;
+    void *workspace_ptr = nullptr;
+    Tensor workspace_tensor;
+    auto reserve_space_size = reserve_space->memory_size();
+    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
+    platform::ScopedActivationDescriptor scope_act_desc;
+    cudnnActivationDescriptor_t activation_desc_ =
+        scope_act_desc.descriptor<T>(act_type);
+    // --------------- cudnn batchnorm workspace ---------------
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize(
+            /*handle=*/dev_ctx.cudnn_handle(),
+            /*mode=*/mode_,
+            /*bnOps=*/bnOps_,
+            /*xDesc=*/data_desc_,
+            /*yDesc=*/data_desc_,
+            /*dyDesc=*/data_desc_,
+            /*dzDesc=*/nullptr,
+            /*dxDesc=*/data_desc_,
+            /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
+            /*activationDesc=*/activation_desc_,
+            /*sizeInBytes=*/&workspace_size),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnGetBatchNormalizationBackwardExWorkspaceSize."));
+
+    workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
+                                                  workspace_size);
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnBatchNormalizationBackwardEx(
+            /*handle=*/dev_ctx.cudnn_handle(),
+            /*mode=*/mode_,
+            /*bnOps=*/bnOps_,
+            /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
+            /*betaDataDiff=*/CudnnDataType<T>::kZero(),
+            /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
+            /*betaParamDiff=*/CudnnDataType<T>::kZero(),
+            /*xDesc=*/data_desc_,
+            /*xData=*/x->template data<T>(),
+            /*yDesc=*/data_desc_,
+            /*yData=*/y->template data<T>(),
+            /*dyDesc=*/data_desc_,
+            /*dyData=*/d_y->template data<T>(),
+            /*dzDesc=*/nullptr,
+            /*dzData=*/nullptr,
+            /*dxDesc=*/data_desc_,
+            /*dxData=*/d_x->template mutable_data<T>(ctx.GetPlace()),
+            /*dBnScaleBiasDesc=*/bn_param_desc_,
+            /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
+            /*bnBiasData=*/bias->template data<BatchNormParamType<T>>(),
+            /*dBnScaleData=*/d_scale
+                ->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+            /*dBnBiasData=*/d_bias
+                ->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+            /*epsilon=*/epsilon,
+            /*savedMean=*/saved_mean_data,
+            /*savedInvVariance=*/saved_var_data,
+            /*activationDesc=*/activation_desc_,
+            /*workspace=*/workspace_ptr,
+            /*workSpaceSizeInBytes=*/workspace_size,
+            /*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
+            /*reserveSpaceSizeInBytes=*/reserve_space_size),
+        platform::errors::External("The error has happened when calling "
+                                   "cudnnBatchNormalizationBackwardEx."));
+
+    // clean when exit.
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnDestroyTensorDescriptor(data_desc_)."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnDestroyTensorDescriptor(bn_param_desc_)."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#if CUDNN_VERSION >= 7401
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(
+    fused_batch_norm_act,
+    ops::FusedBatchNormActKernel<plat::CUDADeviceContext, float>,
+    ops::FusedBatchNormActKernel<plat::CUDADeviceContext, double>,
+    ops::FusedBatchNormActKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    fused_batch_norm_act_grad,
+    ops::FusedBatchNormActGradKernel<plat::CUDADeviceContext, float>,
+    ops::FusedBatchNormActGradKernel<plat::CUDADeviceContext, double>,
+    ops::FusedBatchNormActGradKernel<plat::CUDADeviceContext, plat::float16>);
+#endif
--- a/paddle/fluid/operators/fused/fused_bn_activation_op.h
+++ b/paddle/fluid/operators/fused/fused_bn_activation_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/grad_op_desc_maker.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+class FusedBatchNormActOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override;
+};
+
+class FusedBatchNormActGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FusedBatchNormActOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+template <typename T>
+class FusedBatchNormActGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  std::unique_ptr<T> Apply() const override {
+    std::unique_ptr<T> op(new T());
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Y", this->Output("Y"));
+    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
+
+    op->SetInput("Scale", this->Input("Scale"));
+    op->SetInput("Bias", this->Input("Bias"));
+    op->SetInput("SavedMean", this->Output("SavedMean"));
+    op->SetInput("SavedVariance", this->Output("SavedVariance"));
+    op->SetInput("ReserveSpace", this->Output("ReserveSpace"));
+
+    op->SetAttrMap(this->Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
+    op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
+
+    return op;
+  }
+};
+
+class FusedBatchNormActOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FusedBatchNormActKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+template <typename DeviceContext, typename T>
+class FusedBatchNormActGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1994,6 +1994,26 @@ All parameter, weight, gradient are variables in Paddle.
                        build_strategy = fluid.BuildStrategy()
                        build_strategy.fuse_elewise_add_act_ops = True
                     )DOC")
+      .def_property(
+          "fuse_bn_act_ops",
+          [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
+                              platform::errors::PreconditionNotMet(
+                                  "BuildStrategy is finlaized."));
+            self.fuse_bn_act_ops_ = b;
+          },
+          R"DOC((bool, optional): fuse_bn_act_ops indicate whether
+                to fuse batch_norm and activation_op,
+                it may make the execution faster. Default is False.
+
+                Examples:
+                    .. code-block:: python
+
+                        import paddle.fluid as fluid
+                        build_strategy = fluid.BuildStrategy()
+                        build_strategy.fuse_bn_act_ops = True
+                     )DOC")
      .def_property(
          "fuse_relu_depthwise_conv",
          [](const BuildStrategy &self) {

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -182,6 +182,7 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op)
 list(REMOVE_ITEM TEST_OPS test_imperative_debug_string)
+list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass)

 if (APPLE OR WIN32)
  list(REMOVE_ITEM TEST_OPS test_dataset)
@@ -301,6 +302,7 @@ py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES test_parallel_
 py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES test_parallel_executor_seresnext_with_reduce_cpu)
 py_test_modules(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu MODULES test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
 py_test_modules(test_data_norm_op MODULES test_data_norm_op)
+py_test_modules(test_fuse_bn_act_pass MODULES test_fuse_bn_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000)

 if(NOT WIN32)
    py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer)
@@ -330,6 +332,6 @@ set_tests_properties(test_parallel_executor_test_while_train test_parallel_execu
        test_parallel_executor_crf test_sync_batch_norm_op
        test_parallel_executor_feed_persistable_var
        test_parallel_executor_crf_auto_growth test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
-        test_data_norm_op test_imperative_using_non_zero_gpu
+        test_data_norm_op test_imperative_using_non_zero_gpu test_fuse_bn_act_pass
        test_optimizer_in_control_flow
        test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST")
--- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import unittest
+
+
+class TestFuseBatchNormActPass(unittest.TestCase):
+    def build_program(self, main_program, startup_program, use_cuda, seed=1):
+        main_program.random_seed = seed
+        startup_program.random_seed = seed
+        with fluid.program_guard(main_program, startup_program):
+            x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
+            y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+            hidden1 = fluid.layers.conv2d(
+                input=x,
+                filter_size=3,
+                num_filters=32,
+                stride=1,
+                padding=1,
+                act=None,
+                bias_attr=False,
+                data_format='NHWC')
+            param_attr = fluid.ParamAttr(
+                name='batch_norm_w',
+                initializer=fluid.initializer.Constant(value=1.0))
+            bias_attr = fluid.ParamAttr(
+                name='batch_norm_b',
+                initializer=fluid.initializer.Constant(value=0.0))
+            hidden2 = fluid.layers.batch_norm(
+                input=hidden1,
+                param_attr=param_attr,
+                bias_attr=bias_attr,
+                act='relu',
+                data_layout='NHWC')
+            hidden3 = fluid.layers.fc(input=hidden2, size=128, act='relu')
+            hidden4 = fluid.layers.batch_norm(
+                input=hidden3, act='relu', data_layout='NHWC')
+            prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax')
+            loss = fluid.layers.cross_entropy(input=prediction, label=y)
+            loss = fluid.layers.mean(loss)
+            sgd = fluid.optimizer.SGD(learning_rate=0.001)
+            if use_cuda:
+                sgd = fluid.contrib.mixed_precision.decorate(
+                    sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
+            sgd.minimize(loss)
+        return x, y, loss
+
+    def check(self, place, use_cuda):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        x, y, loss = self.build_program(main_program, startup_program, use_cuda)
+        exe = fluid.Executor(place)
+        iters = 10
+        batch_size = 16
+        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
+
+        # close fused_bn_act_ops
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.fuse_bn_act_ops = False
+        binary = fluid.CompiledProgram(main_program).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy)
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=batch_size)
+        loss_vals = []
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup_program)
+            for _ in range(iters):
+                data = next(train_reader())
+                loss_v = exe.run(binary,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+                loss_vals.append(loss_v[0][0])
+
+        # open fused_bn_act_ops
+        build_strategy_fused = fluid.BuildStrategy()
+        build_strategy_fused.fuse_bn_act_ops = True
+        binary_fused = fluid.CompiledProgram(main_program).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy_fused)
+        train_reader_fused = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=batch_size)
+        loss_vals_fused = []
+        scope_fused = fluid.Scope()
+        with fluid.scope_guard(scope_fused):
+            exe.run(startup_program)
+            for _ in range(iters):
+                data = next(train_reader_fused())
+                loss_v = exe.run(binary_fused,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+                loss_vals_fused.append(loss_v[0][0])
+
+        # check loss
+        for i in range(iters):
+            self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5)
+
+    def test_fuse_bn_act_pass_cpu(self):
+        place = fluid.CPUPlace()
+        self.check(place, use_cuda=False)
+
+    def test_fuse_bn_act_pass_cuda(self):
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self.check(place, use_cuda=True)
+
+
+if __name__ == '__main__':
+    unittest.main()