diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index c1ba6606f1064750a9d7e087ded1ec3634bcc4a5..d5966ad5a97a97ec40c8a01d2d2c8ed5d7f90421 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -93,5 +93,6 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass - fuse_elewise_add_act_pass multi_batch_merge_pass + fuse_elewise_add_act_pass multi_batch_merge_pass + fuse_relu_depthwise_conv_pass memory_optimize_pass lock_free_optimize_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index df0ff772c9d35c88ec5a6112525c56aa92d359b9..756470c5b0bfb2ce2a90531442211bcb19172dae 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -55,6 +55,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } // Add op fusion. + if (strategy.fuse_relu_depthwise_conv_) { + AppendPass("fuse_relu_depthwise_conv_pass"); + } if (strategy.fuse_elewise_add_act_ops_) { auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass"); // Add a graph viz pass to record a graph. @@ -210,6 +213,12 @@ std::unique_ptr BuildStrategy::Apply( pass->Set>( kAllOpDescs, new std::vector(main_program.Block(0).AllOps())); + } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { + if (!use_cuda) { + LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " + "GPU, skipped."; + continue; + } } graph = pass->Apply(std::move(graph)); } @@ -220,6 +229,7 @@ std::unique_ptr BuildStrategy::Apply( } // namespace framework } // namespace paddle +USE_PASS(fuse_relu_depthwise_conv_pass); USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_batch_merge_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 15c2e01b6142571883c759efb1e26b609be9adb4..603df2e06936e3d9d8e7ec62efd0c6e83200239c 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -74,6 +74,8 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; + bool fuse_relu_depthwise_conv_{false}; + bool memory_optimize_{false}; bool memory_early_delete_{false}; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 84b5321264796b18ac5bf666bc4b4ac403d9e4ea..b118dccd1b3de881b4791bff6cd331726c8e05da 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -70,6 +70,7 @@ if(WITH_MKLDNN) endif() cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) +cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d94008ea82d0e09732d4b6448fdded94b60733c --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr FuseReluDepthwiseConvPass::ApplyImpl( + std::unique_ptr graph) const { + graph = FuseReluDepthwiseConv(std::move(graph), true); + graph = FuseReluDepthwiseConv(std::move(graph), false); + return graph; +} + +std::unique_ptr FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( + std::unique_ptr graph, bool only_forward) const { + PADDLE_ENFORCE(graph.get()); + if (only_forward) + FusePassBase::Init("relu_depthwise_conv_only_forward", graph.get()); + else + FusePassBase::Init("relu_depthwise_conv", graph.get()); + /* + x ---act--> y ---layer-> z + +----------+ + ↓ ↓ + x' <--act'--- y' <-layer'--- z' + + fuse to: + + x ---act-layer-> z + | + ↓ + x' <--act-layer'--- z' + + */ + + GraphPatternDetector gpd; + auto *pattern = gpd.mutable_pattern(); + std::string act_type = "relu"; + std::string layer_type = "depthwise_conv2d"; + auto *x = pattern->NewNode("x")->AsInput(); + auto *y = pattern->NewNode("y")->AsIntermediate(); + auto *z = pattern->NewNode("z")->AsOutput(); + PDNode *xg = nullptr; + PDNode *yg = nullptr; + PDNode *zg = nullptr; + if (!only_forward) { + xg = pattern->NewNode("xg")->AsOutput(); + yg = pattern->NewNode("yg")->AsIntermediate(); + zg = pattern->NewNode("zg")->AsInput(); + } + + PDNode *act_g = nullptr; + PDNode *layer_g = nullptr; + auto *act = pattern->NewNode("act")->assert_is_op(act_type); + auto *layer = pattern->NewNode("layer")->assert_is_op(layer_type); + if (!only_forward) { + act_g = pattern->NewNode("act_g")->assert_is_op(act_type + "_grad"); + layer_g = pattern->NewNode("layer_g")->assert_is_op(layer_type + "_grad"); + } + + act->LinksFrom({x}).LinksTo({y}); + layer->LinksFrom({y}).LinksTo({z}); + if (!only_forward) { + layer_g->LinksFrom({y, zg}).LinksTo({yg}); + act_g->LinksFrom({y, yg}).LinksTo({xg}); + } + + int count = 0; + std::unordered_set need_removed_nodes; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseReluDepthwiseConv fuse"; + // 1. turn on fuse option + auto *layer_op = subgraph.at(layer)->Op(); + layer_op->SetAttr("use_cudnn", false); + layer_op->SetAttr("fuse_relu_before_depthwise_conv", true); + + OpDesc *layer_g_op = nullptr; + if (!only_forward) { + layer_g_op = subgraph.at(layer_g)->Op(); + layer_g_op->SetAttr("use_cudnn", false); + layer_g_op->SetAttr("fuse_relu_before_depthwise_conv", true); + } + // 2. 
connect x to layer and layer_g, layer_g to xg + auto *y_var = subgraph.at(y)->Var(); + auto *x_var = subgraph.at(x)->Var(); + VarDesc *yg_var = nullptr; + VarDesc *xg_var = nullptr; + if (!only_forward) { + yg_var = subgraph.at(yg)->Var(); + xg_var = subgraph.at(xg)->Var(); + } + + PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1); + PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name()); + layer_op->SetInput("Input", {x_var->Name()}); + subgraph.at(layer)->inputs.push_back(subgraph.at(x)); + subgraph.at(x)->outputs.push_back(subgraph.at(layer)); + VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name(); + + if (!only_forward) { + PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1); + PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name()); + layer_g_op->SetInput("Input", {x_var->Name()}); + subgraph.at(layer_g)->inputs.push_back(subgraph.at(x)); + subgraph.at(x)->outputs.push_back(subgraph.at(layer_g)); + + PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1); + PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0], + yg_var->Name()); + layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()}); + subgraph.at(layer_g)->outputs.push_back(subgraph.at(xg)); + subgraph.at(xg)->inputs.push_back(subgraph.at(layer_g)); + VLOG(4) << "replace " << yg_var->Name() << " -> " << xg_var->Name(); + } + + // 3. delete y, yg, act, act_g + + if (only_forward) { + need_removed_nodes.insert({subgraph.at(y), subgraph.at(act)}); + } else { + need_removed_nodes.insert({subgraph.at(y), subgraph.at(yg), + subgraph.at(act), subgraph.at(act_g)}); + } + count++; + }; + gpd(graph.get(), handler); + GraphSafeRemoveNodes(graph.get(), need_removed_nodes); + AddStatis(count); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_relu_depthwise_conv_pass, + paddle::framework::ir::FuseReluDepthwiseConvPass); diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..6bd653775e42c1ee16051e205e9fa9888ea05eaa --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
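+// Editor's note: this pass rewrites a relu -> depthwise_conv2d pattern (and,
+// for training graphs, the matching depthwise_conv2d_grad / relu_grad pair)
+// so that the depthwise convolution kernels apply the relu themselves and the
+// intermediate activation tensor can be dropped. It is enabled from Python
+// through BuildStrategy.fuse_relu_depthwise_conv and only takes effect on
+// GPU. A minimal usage sketch, assuming the ParallelExecutor API of this
+// release:
+//
+//   build_strategy = fluid.BuildStrategy()
+//   build_strategy.fuse_relu_depthwise_conv = True
+//   pe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name,
+//                               build_strategy=build_strategy)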
+#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the relu and depthwise conv + */ +class FuseReluDepthwiseConvPass : public FusePassBase { + public: + virtual ~FuseReluDepthwiseConvPass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr FuseReluDepthwiseConv( + std::unique_ptr graph, bool only_forward) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 16ffc11419f667e4d1c7f6c9a5be355478a48fab..0ce174654e85175f0b949f860a00afafc548ed3e 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -143,7 +143,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { // Get unique name for storing MKLDNN primitives const std::string key = platform::ConvMKLDNNHandler::GetHash( src_tz, weights_tz, strides, paddings, dilations, groups, - ctx.op().Output("Output")); + ctx.op().Input("Input") + ctx.op().Input("Filter")); const std::string key_conv_pd = key + "@conv_pd"; std::vector pipeline; @@ -371,7 +371,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::ConvMKLDNNHandler::AppendKey( &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, input->format(), fuse_relu, fuse_residual_conn, - ctx.op().Output("Output")); + ctx.op().Input("Input") + ctx.op().Input("Filter")); const std::string key_conv_pd = key + "@conv_pd"; bool need_s8_to_u8 = false; @@ -798,7 +798,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* input = ctx.Input("Input"); const Tensor* filter = ctx.Input("Filter"); - const Tensor* output = ctx.Input("Output"); const Tensor* output_grad = ctx.Input(framework::GradVarName("Output")); Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); @@ -810,9 +809,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && filter->format() != memory::format::format_undef, "Wrong layout/format set for Filter tensor"); - PADDLE_ENFORCE(output->layout() == DataLayout::kMKLDNN && - output->format() != memory::format::format_undef, - "Wrong layout/format set for Output tensor"); PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN && output_grad->format() != memory::format::format_undef, "Wrong layout/format set for output_grad tensor"); @@ -840,18 +836,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); int g = std::max(groups, 1); GetWeightsTz(weights_tz, g, is_conv3d); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + std::vector dst_tz = + paddle::framework::vectorize2int(output_grad->dims()); auto src_format = input->format(); mkldnn::memory::format weights_format = GetWeightsFormat(filter->format(), g, is_conv3d); - // Get an unique name from "argument" name of "Output" variable + // Get an unique name from "argument" name of "input" and "Filter" variable // as well as attributes of primitive to be created // This name will be used as key when saving info into device context const std::string key = platform::ConvMKLDNNHandler::GetHash( 
src_tz, weights_tz, strides, paddings, dilations, groups, - ctx.op().Input("Output")); + ctx.op().Input("Input") + ctx.op().Input("Filter")); const std::string key_conv_pd = key + "@conv_pd"; std::vector pipeline; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index c8b33b8932ddd3bb9706d5b555ca68df4560a31e..bd788f03e7d666aad7ce6f0c63cea30f029e3491 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -171,6 +171,9 @@ void Conv2DOpMaker::Make() { "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr("fuse_relu_before_depthwise_conv", + "(bool, default false) Only used in cuda depthwise kernel") + .SetDefault(false); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); @@ -412,18 +415,43 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( customized_type_value); } +class Conv2dGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType(GradOpType()); + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetInput("Bias", Input("Bias")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + + op->SetAttrMap(Attrs()); + + return std::unique_ptr(op); + } + + virtual std::string GradOpType() const { + return this->ForwardOpType() + "_grad"; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, - ops::ConvOpInferVarType, - paddle::framework::DefaultGradOpDescMaker); + ops::ConvOpInferVarType, ops::Conv2dGradMaker); REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad); // depthwise convolution op REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ConvOpInferVarType, ops::Conv2dGradMaker); REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index eaa288edc554d2b62eb67ca01ed2459a88772430..797c665165975b4230c0edda460e3eebba8e400c 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -397,12 +397,18 @@ class DepthwiseConvKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); std::vector dilations = context.Attr>("dilations"); - - math::DepthwiseConvFunctor depthwiseConv; - + bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); auto& dev_ctx = context.template device_context(); - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output); + + if (fuse_relu) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output); + } else { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output); + } } }; @@ -424,27 +430,42 @@ class DepthwiseConvGradKernel : public framework::OpKernel { std::vector strides = 
context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); std::vector dilations = context.Attr>("dilations"); + bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); math::SetConstant set_zero; auto& dev_ctx = context.template device_context(); - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - if (input_grad) { input_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, input_grad, static_cast(0)); - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad); + + if (fuse_relu) { + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad); + } else { + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad); + } } if (filter_grad) { filter_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, filter_grad, static_cast(0)); - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, - dilations, filter_grad); + if (fuse_relu) { + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad); + } else { + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad); + } } } }; diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 4fa15058f86fbcf689bf3a9f2cd5e4c5d75b5323..e18d9841bb87c6a684d53e1bceb6c20a37dcfcfa 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/group_norm_op.h" +#include namespace paddle { namespace operators { @@ -102,8 +103,8 @@ class GroupNormGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { // check input - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of GroupNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Mean"), "Input(Mean) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Variance"), @@ -113,7 +114,7 @@ class GroupNormGradOp : public framework::OperatorWithKernel { // check output if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Y")); } if (ctx->HasOutput(framework::GradVarName("Scale"))) { ctx->SetOutputDim(framework::GradVarName("Scale"), @@ -145,12 +146,36 @@ class GroupNormGradOp : public framework::OperatorWithKernel { } }; +class GroupNormGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("group_norm_grad"); + op->SetInput("Scale", Input("Scale")); + op->SetInput("Bias", Input("Bias")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetInput("Y", Output("Y")); + op->SetInput("Mean", Output("Mean")); + op->SetInput("Variance", Output("Variance")); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); + + op->SetAttrMap(Attrs()); + + return std::unique_ptr(op); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::GroupNormGradMaker); REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp); REGISTER_OP_CPU_KERNEL( group_norm, ops::GroupNormKernel, diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 27174630227c8123a31cb1c70d5eb5f5b3ee5107..6e460c470be71bfaaa37b4ef796027c2e2b9e376 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -12,12 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include "cub/cub.cuh" #include "paddle/fluid/operators/group_norm_op.h" +#include "paddle/fluid/platform/cuda_device_function.h" namespace paddle { namespace operators { +enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; + +#define CHECK_CASE(i, flags, kernel_name, args...) \ + if (i == flags) { \ + kernel_name<<>>(args); \ + } + +// 0 for no scale, no bias +// 1 for has scale, no bias +// 2 for no scale, has bias +// 3 for has scale, has bias +#define UNROLL_ALL_CASES(flags, kernel_name, args...) 
\ + CHECK_CASE(0, flags, kernel_name, args) \ + CHECK_CASE(1, flags, kernel_name, args) \ + CHECK_CASE(2, flags, kernel_name, args) \ + CHECK_CASE(3, flags, kernel_name, args) + +template +__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { + typedef cub::WarpReduce WarpReduce; + typename WarpReduce::TempStorage temp_storage; + value = WarpReduce(temp_storage).Sum(value); + if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value); +} + template __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int imsize, int groups, @@ -36,21 +62,11 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, } x_mean /= number * imsize; x_var /= number * imsize; - __shared__ T s_mem[2]; - if (threadIdx.x == 0) { - s_mem[0] = s_mem[1] = 0; - } - __syncthreads(); - paddle::platform::CudaAtomicAdd(&s_mem[0], x_mean); - paddle::platform::CudaAtomicAdd(&s_mem[1], x_var); - __syncthreads(); - if (threadIdx.x == 0) { - paddle::platform::CudaAtomicAdd(&mean[bid * groups + gid], s_mem[0]); - paddle::platform::CudaAtomicAdd(&var[bid * groups + gid], s_mem[1]); - } + CudaAtomicAddWithWarp(&mean[bid * groups + gid], x_mean); + CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } -template +template __global__ void GroupNormForward(const T* x, const T* mean, const T* var, const T* scale, const T* bias, int N, int C, int imsize, int groups, int group_size, @@ -68,8 +84,8 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var, for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val = x[(bid * C + ccid) * imsize + imid]; val = (val - x_mean) * var_inv; - if (scale) val *= scale[gid * group_size + cid]; - if (bias) val += bias[gid * group_size + cid]; + if (flags & kHasScale) val *= scale[gid * group_size + cid]; + if (flags & kHasBias) val += bias[gid * group_size + cid]; y[(bid * C + ccid) * imsize + imid] = val; } } @@ -115,93 +131,87 @@ class GroupNormKernel if (bias) bias_data = bias->data(); int imsize = x_dims[2] * x_dims[3]; - int block_size = std::min(512, imsize); + int block_size = std::min(1024, imsize); dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); GroupNormForwardGetMeanAndVar<<>>( x_data, x_dims[0], x_dims[1], imsize, groups, group_size, mean_data, temp_var_data); - GroupNormForward<<>>( - x_data, mean_data, temp_var_data, scale_data, bias_data, x_dims[0], - x_dims[1], imsize, groups, group_size, epsilon, y_data, var_data); + int flags = + (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data, + scale_data, bias_data, x_dims[0], x_dims[1], imsize, + groups, group_size, epsilon, y_data, var_data); } }; -template -__global__ void GroupNormBackwardGetMeanAndVar( - const T* x, const T* mean, const T* var, const T* scale, const T* d_y, - int N, int C, int imsize, int groups, int group_size, T epsilon, T* d_x, - T* d_mean, T* d_var, T* d_scale, T* d_bias) { +template +__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale, + const T* bias, const T* d_y, + int N, int C, int imsize, + int groups, int group_size, + T epsilon, T* d_mean, T* d_var, + T* d_scale, T* d_bias) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; int number = min(group_size, static_cast(C - gid * group_size)); int ccid = gid * group_size + cid; if (ccid >= C) return; - T x_mean = mean[bid * groups + gid]; - T x_var = var[bid * groups + gid]; - T var_inv = 1.0 / sqrt(x_var + epsilon); - T 
d_var_inv = 0, d_x_mean = 0; + T x_scale = (flags & kHasScale) ? scale[ccid] : 1; + T x_bias = (flags & kHasBias) ? bias[ccid] : 0; + T x_scale_inv = 0; + if (x_scale != 0) x_scale_inv = 1.0 / x_scale; T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - T tmp = x[(bid * C + ccid) * imsize + imid]; - T val = (tmp - x_mean) * var_inv; + T val = x[(bid * C + ccid) * imsize + imid] - x_bias; T dval = d_y[(bid * C + ccid) * imsize + imid]; - if (d_bias) d_bias_data += dval; - if (d_scale) d_scale_data += val * dval; - if (scale) dval = dval * scale[ccid]; - d_var_data += (tmp - x_mean) * dval; - T d_tmp = dval * var_inv; - if (d_x) d_x[(bid * C + ccid) * imsize + imid] = d_tmp; - d_mean_data -= d_tmp; - } - __shared__ T s_mem[4]; - if (threadIdx.x == 0) { - s_mem[0] = s_mem[1] = 0; - if (d_scale) s_mem[2] = 0; - if (d_bias) s_mem[3] = 0; - } - __syncthreads(); - paddle::platform::CudaAtomicAdd(&s_mem[0], d_mean_data); - paddle::platform::CudaAtomicAdd(&s_mem[1], d_var_data); - if (d_scale) paddle::platform::CudaAtomicAdd(&s_mem[2], d_scale_data); - if (d_bias) paddle::platform::CudaAtomicAdd(&s_mem[3], d_bias_data); - __syncthreads(); - if (threadIdx.x == 0) { - paddle::platform::CudaAtomicAdd(&d_mean[bid * groups + gid], s_mem[0]); - paddle::platform::CudaAtomicAdd(&d_var[bid * groups + gid], s_mem[1]); - if (d_scale) paddle::platform::CudaAtomicAdd(&d_scale[ccid], s_mem[2]); - if (d_bias) paddle::platform::CudaAtomicAdd(&d_bias[ccid], s_mem[3]); + d_var_data += val * dval; + d_mean_data += dval * x_scale; + + val = val * x_scale_inv; + d_bias_data += dval; + d_scale_data += val * dval; } + CudaAtomicAddWithWarp(&d_mean[bid * groups + gid], d_mean_data); + CudaAtomicAddWithWarp(&d_var[bid * groups + gid], d_var_data); + if (flags & kHasScale) CudaAtomicAddWithWarp(&d_scale[ccid], d_scale_data); + if (flags & kHasBias) CudaAtomicAddWithWarp(&d_bias[ccid], d_bias_data); } -template -__global__ void GroupNormBackward(const T* x, const T* mean, const T* var, - const T* d_mean, const T* d_var, int N, int C, - int imsize, int groups, int group_size, - T epsilon, T* d_x) { +template +__global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, + const T* bias, const T* var, const T* d_mean, + const T* d_var, int N, int C, int imsize, + int groups, int group_size, T epsilon, + T* d_x) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; int number = min(group_size, static_cast(C - gid * group_size)); int ccid = gid * group_size + cid; if (ccid >= C) return; - T x_mean = mean[bid * groups + gid]; T x_var = var[bid * groups + gid]; T d_x_mean = d_mean[bid * groups + gid]; - T d_var_inv = d_var[bid * groups + gid]; + T d_x_var = d_var[bid * groups + gid]; + + T x_var_inv = 1.0 / sqrt(x_var + epsilon); + T number_inv = 1.0 / (number * imsize); + + T x_scale = (flags & kHasScale) ? scale[ccid] : 1; + T x_bias = (flags & kHasBias) ? 
bias[ccid] : 0; + T x_scale_inv = 0; + if (x_scale != 0) x_scale_inv = 1.0 / x_scale; - T d_x_var = - -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv; - d_x_mean -= 2 * d_x_var * x_mean; - d_x_var /= number * imsize; - d_x_mean /= number * imsize; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T tmp = x[(bid * C + ccid) * imsize + imid]; - if (d_x) - d_x[(bid * C + ccid) * imsize + imid] += d_x_mean + tmp * 2 * d_x_var; + T v_y = (tmp - x_bias) * x_scale_inv; + T dly = d_y[(bid * C + ccid) * imsize + imid]; + d_x[(bid * C + ccid) * imsize + imid] = + x_var_inv * + (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); } } @@ -211,10 +221,10 @@ class GroupNormGradKernel public: void Compute(const framework::ExecutionContext& ctx) const override { const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("X"); - auto* mean = ctx.Input("Mean"); + auto* x = ctx.Input("Y"); auto* var = ctx.Input("Variance"); auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); auto* d_y = ctx.Input(framework::GradVarName("Y")); const auto groups = ctx.Attr("groups"); @@ -226,11 +236,7 @@ class GroupNormGradKernel const auto& x_dims = x->dims(); const int group_size = (x_dims[1] - 1) / groups + 1; - T* d_x_data = nullptr; - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - d_x_data = d_x->data(); - } + d_x->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); @@ -245,8 +251,9 @@ class GroupNormGradKernel T* temp_mean_data = temp_mean.data(); auto* x_data = x->data(); + T* d_x_data = nullptr; + if (d_x) d_x_data = d_x->data(); auto* y_data = d_y->data(); - auto* mean_data = mean->data(); auto* var_data = var->data(); T* d_scale_data = nullptr; if (d_scale) { @@ -263,18 +270,25 @@ class GroupNormGradKernel const T* scale_data = nullptr; if (scale) scale_data = scale->data(); + const T* bias_data = nullptr; + if (bias) bias_data = bias->data(); int imsize = x_dims[2] * x_dims[3]; - int block_size = std::min(512, imsize); + int block_size = std::min(1024, imsize); dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); - GroupNormBackwardGetMeanAndVar<<>>( - x_data, mean_data, var_data, scale_data, y_data, x_dims[0], x_dims[1], - imsize, groups, group_size, epsilon, d_x_data, temp_mean_data, - temp_var_data, d_scale_data, d_bias_data); - GroupNormBackward<<>>( - x_data, mean_data, var_data, temp_mean_data, temp_var_data, x_dims[0], - x_dims[1], imsize, groups, group_size, epsilon, d_x_data); + int flags = + (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data, + bias_data, y_data, x_dims[0], x_dims[1], imsize, groups, + group_size, epsilon, temp_mean_data, temp_var_data, + d_scale_data, d_bias_data); + if (d_x_data != nullptr) { + UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data, + bias_data, var_data, temp_mean_data, temp_var_data, + x_dims[0], x_dims[1], imsize, groups, group_size, + epsilon, d_x_data); + } } }; diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index 3d6c6a46a9662e3b99b4e080b424b4794db7fcc3..498e65f614925f746dacacd3453e046a08ff5494 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -96,10 +96,10 @@ class GroupNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const float 
epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("X"); - auto* mean = ctx.Input("Mean"); + auto* x = ctx.Input("Y"); auto* var = ctx.Input("Variance"); auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); auto* d_y = ctx.Input(framework::GradVarName("Y")); const auto groups = ctx.Attr("groups"); @@ -111,19 +111,13 @@ class GroupNormGradKernel : public framework::OpKernel { const auto& x_dims = x->dims(); const int group_size = (x_dims[1] - 1) / groups + 1; - // TODO(liangdun): need to check d_x is null + d_x->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - T* d_x_data = nullptr; - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_x, static_cast(0)); - d_x_data = d_x->data(); - } auto* x_data = x->data(); + auto* d_x_data = d_x->data(); auto* y_data = d_y->data(); - auto* mean_data = mean->data(); auto* var_data = var->data(); T* d_scale_data = nullptr; if (d_scale) { @@ -140,6 +134,8 @@ class GroupNormGradKernel : public framework::OpKernel { const T* scale_data = nullptr; if (scale) scale_data = scale->data(); + const T* bias_data = nullptr; + if (bias) bias_data = bias->data(); int imsize = x_dims[2] * x_dims[3]; auto* iter_x_data = x_data; @@ -147,46 +143,45 @@ class GroupNormGradKernel : public framework::OpKernel { auto* iter_y_data = y_data; for (int bid = 0; bid < x_dims[0]; bid++) for (int gid = 0; gid < groups; gid++) { - T x_mean = mean_data[bid * groups + gid]; T x_var = var_data[bid * groups + gid]; T var_inv = 1.0 / sqrt(x_var + epsilon); int number = std::min(group_size, static_cast(x_dims[1] - gid * group_size)); - auto* tmp = iter_x_data; - auto* tmp2 = iter_d_x_data; - T d_var_inv = 0, d_x_mean = 0; + T number_inv = 1.0 / (number * imsize); + auto* iter_x_data2 = iter_x_data; + auto* iter_y_data2 = iter_y_data; + T dp_scale = 0, dp_bias = 0; for (int cid = 0; cid < number; cid++) { for (int imid = 0; imid < imsize; - imid++, tmp++, iter_y_data++, iter_d_x_data++) { - T val = (tmp[0] - x_mean) * var_inv; + imid++, iter_x_data++, iter_y_data++) { + T val = iter_x_data[0]; + if (bias_data) val -= bias_data[gid * group_size + cid]; T dval = iter_y_data[0]; + dp_scale += val * dval; + dp_bias += dval * scale_data[gid * group_size + cid]; + + if (scale_data && scale_data[gid * group_size + cid] != 0) + val /= scale_data[gid * group_size + cid]; if (d_bias_data) d_bias_data[gid * group_size + cid] += dval; if (d_scale_data) d_scale_data[gid * group_size + cid] += val * dval; - if (scale_data) dval = scale_data[gid * group_size + cid] * dval; - - d_var_inv += (tmp[0] - x_mean) * dval; - T d_tmp = dval * var_inv; - if (d_x_data) iter_d_x_data[0] += d_tmp; - d_x_mean -= d_tmp; } } - T d_x_var = - -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv; - d_x_mean -= 2 * d_x_var * x_mean; - d_x_var /= number * imsize; - d_x_mean /= number * imsize; - - iter_d_x_data = tmp2; - - if (d_x_data) { - for (int cid = 0; cid < number; cid++) { - for (int imid = 0; imid < imsize; - imid++, iter_x_data++, iter_d_x_data++) { - iter_d_x_data[0] += d_x_mean; - iter_d_x_data[0] += iter_x_data[0] * 2 * d_x_var; - } + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; + imid++, iter_d_x_data++, iter_x_data2++, iter_y_data2++) { + T v_y = iter_x_data2[0]; + T dly = iter_y_data2[0]; + T dss = dp_scale; + T dbs = dp_bias; + T v_scale = scale_data[gid * group_size + cid]; + T v_bias = bias_data[gid * group_size + cid]; + v_y -= v_bias; + if (v_scale != 0) v_y /= v_scale; 
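+            // Editor's note: v_y is the normalized value x_hat recovered from
+            // the saved output (y = scale * x_hat + bias); over the group,
+            // dp_scale holds sum((y - bias) * d_y) and dp_bias holds
+            // sum(scale * d_y), so the assignment below is the usual
+            // group-norm input gradient
+            //   d_x = (scale * d_y - mean(scale * x_hat * d_y) * x_hat
+            //          - mean(scale * d_y)) / sqrt(var + eps)
+            // written in terms of y instead of the (no longer saved) input x.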
+ iter_d_x_data[0] = + (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) * + var_inv; } } } diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 66d37c3bf31ffa420cc527cb576dcdc5505a0960..240cec14dc2662d154a0ef476e197ab446674a54 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -14,7 +14,9 @@ limitations under the License. */ #include #include +#include "cub/cub.cuh" #include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { @@ -22,28 +24,11 @@ namespace operators { namespace math { template -__inline__ __device__ T warpReduceSum(T val) { -#if CUDA_VERSION < 9000 - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down(val, offset); - return val; -#else -#define FULL_MASK 0xffffffff - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down_sync(FULL_MASK, val, offset); - return val; -#endif -} -__forceinline__ __device__ unsigned lane_id() { - unsigned ret; - asm volatile("mov.u32 %0, %laneid;" : "=r"(ret)); - return ret; -} - -__forceinline__ __device__ unsigned warp_id() { - unsigned ret; - asm volatile("mov.u32 %0, %warpid;" : "=r"(ret)); - return ret; +__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { + typedef cub::WarpReduce WarpReduce; + typename WarpReduce::TempStorage temp_storage; + value = WarpReduce(temp_storage).Sum(value); + if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value); } #define ARG_DEFINE_KernelDepthwiseConv \ @@ -58,7 +43,7 @@ __forceinline__ __device__ unsigned warp_id() { // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. 
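+// Editor's note: when the fuse_relu_before_conv template flag is true, the
+// forward kernels below read each input element through max(x, 0), i.e. the
+// relu that fuse_relu_depthwise_conv_pass removed from the graph is applied
+// on the fly while computing the convolution.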
-template +template __device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) { for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { @@ -87,7 +72,11 @@ __device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) { if (h_in >= h_start && h_in < h_end && w_in >= w_start && w_in < w_end) { const int offset = in_offset + h_in * input_width + w_in; - value += weight[weight_offset] * input_data[offset]; + if (fuse_relu_before_conv) { + value += weight[weight_offset] * max(0.0f, input_data[offset]); + } else { + value += weight[weight_offset] * input_data[offset]; + } } weight_offset++; } @@ -100,7 +89,7 @@ __device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) { } } -template +template __device__ __inline__ void KernelDepthwiseConvCFilter( ARG_DEFINE_KernelDepthwiseConv) { const int kWeghtSize = c_filter * c_filter; @@ -137,7 +126,12 @@ __device__ __inline__ void KernelDepthwiseConvCFilter( if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { const int offset = in_offset + h_in * input_width + w_in; - value += r_weight[h_f * c_filter + w_f] * input_data[offset]; + if (fuse_relu_before_conv) { + value += r_weight[h_f * c_filter + w_f] * + max(0.0f, input_data[offset]); + } else { + value += r_weight[h_f * c_filter + w_f] * input_data[offset]; + } } } } @@ -149,18 +143,19 @@ __device__ __inline__ void KernelDepthwiseConvCFilter( } } -template +template __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { if (c_filter_multiplier == 0) { if (c_filter == -1) - KernelDepthwiseConv( + KernelDepthwiseConv( input_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, filter_multiplier, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, dilate_height, dilate_width, output_data); else - KernelDepthwiseConvCFilter( + KernelDepthwiseConvCFilter( input_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, filter_multiplier, filter_height, filter_width, stride_height, @@ -168,14 +163,14 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { dilate_width, output_data); } else { if (c_filter == -1) - KernelDepthwiseConv(input_data, filter_data, batch_size, - output_channels, output_height, output_width, - input_channels, input_height, input_width, - c_filter_multiplier, filter_height, filter_height, - c_stride, c_stride, padding_height, padding_width, - dilate_height, dilate_width, output_data); + KernelDepthwiseConv( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_height, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + output_data); else - KernelDepthwiseConvCFilter( + KernelDepthwiseConvCFilter( input_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_height, c_stride, c_stride, @@ -186,17 +181,18 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { // CUDA kernel to compute the depthwise convolution backprop w.r.t input. 
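+// Editor's note: the fused input-grad kernels below also take the original
+// (pre-relu) input; where input_data[index] <= 0 they write a zero gradient
+// and skip the accumulation, which folds the relu backward step into the
+// depthwise convolution backward pass.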
#define ARG_DEFINE_KernelDepthwiseConvInputGrad \ - const T *const output_grad_data, const T *const filter_data, \ - const int batch_size, const int output_channels, \ - const int output_height, const int output_width, \ - const int input_channels, const int input_height, const int input_width, \ + const T *const input_data, const T *const output_grad_data, \ + const T *const filter_data, const int batch_size, \ + const int output_channels, const int output_height, \ + const int output_width, const int input_channels, \ + const int input_height, const int input_width, \ const int filter_multiplier, const int filter_height, \ const int filter_width, const int stride_height, const int stride_width, \ const int padding_height, const int padding_width, \ const int dilate_height, const int dilate_width, \ T *const input_grad_data -template +template __device__ __inline__ void KernelDepthwiseConvInputGrad( ARG_DEFINE_KernelDepthwiseConvInputGrad) { for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { @@ -217,6 +213,15 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( int w_out_end = w_in + padding_width; T value = 0; + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; + if (fuse_relu_before_conv) { + if (input_data[index] <= 0) { + input_grad_data[index] = 0; + continue; + } + } for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; c_out++) { @@ -242,15 +247,13 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( } } } - int index = - ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + - w_in; input_grad_data[index] = value; } } } -template +template __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( ARG_DEFINE_KernelDepthwiseConvInputGrad) { const int kWeghtSize = c_filter * c_filter * c_filter_multiplier + 1; @@ -276,6 +279,15 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; T value = 0; + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; + if (fuse_relu_before_conv) { + if (input_data[index] <= 0) { + input_grad_data[index] = 0; + continue; + } + } for (int c_i = 0; c_i < filter_multiplier; c_i++) { int c_out = c_in * filter_multiplier + c_i; @@ -300,34 +312,33 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( } } } - int index = - ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + - w_in; input_grad_data[index] = value; } } } -template +template __global__ void KernelDepthwiseConvInputGradSp( ARG_DEFINE_KernelDepthwiseConvInputGrad) { if (c_filter_multiplier == 0) - KernelDepthwiseConvInputGrad( - output_grad_data, filter_data, batch_size, output_channels, + KernelDepthwiseConvInputGrad( + input_data, output_grad_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, filter_multiplier, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, dilate_height, dilate_width, input_grad_data); else if (c_filter == -1) - KernelDepthwiseConvInputGrad( - output_grad_data, filter_data, batch_size, output_channels, + KernelDepthwiseConvInputGrad( + input_data, output_grad_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, padding_height, padding_width, dilate_height, dilate_width, 
input_grad_data); else - KernelDepthwiseConvInputGradCFilter( - output_grad_data, filter_data, batch_size, output_channels, + KernelDepthwiseConvInputGradCFilter( + input_data, output_grad_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, padding_height, padding_width, dilate_height, dilate_width, @@ -335,7 +346,7 @@ __global__ void KernelDepthwiseConvInputGradSp( } // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. -template +template __device__ __inline__ void KernelDepthwiseConvFilterGrad( const T* output_grad_data, const T* input_data, const int num, const int output_channels, const int output_height, const int output_width, @@ -347,7 +358,6 @@ __device__ __inline__ void KernelDepthwiseConvFilterGrad( T s = 0; int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; - int lid = lane_id(); for (int image_w = threadIdx.x; image_w < output_width; image_w += blockDim.x) { @@ -364,28 +374,28 @@ __device__ __inline__ void KernelDepthwiseConvFilterGrad( if (image_wk < 0 || image_wk >= input_width) continue; #define gaid(N, C, H, W) \ ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W)) - - s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - input_data[((bid * (gridDim.z / filter_multiplier) + - kernel_id / filter_multiplier) * - input_height + - image_hk) * - input_width + - image_wk]; + int input_id = ((bid * (gridDim.z / filter_multiplier) + + kernel_id / filter_multiplier) * + input_height + + image_hk) * + input_width + + image_wk; + if (fuse_relu_before_conv) { + s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * + max(0.0f, input_data[input_id]); + } else { + s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * + input_data[input_id]; + } #undef gaid } } } -#if __CUDA_ARCH__ >= 530 - s = warpReduceSum(s); - if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); -#else - paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); -#endif + CudaAtomicAddWithWarp(&filter_grad_data[gbid], s); } -template +template __global__ void KernelDepthwiseConvFilterGradSp( const T* output_grad_data, const T* input_data, const int num, const int output_channels, const int output_height, const int output_width, @@ -395,14 +405,14 @@ __global__ void KernelDepthwiseConvFilterGradSp( const int padding_height, const int padding_width, const int dilate_height, const int dilate_width, T* filter_grad_data) { if (c_filter_multiplier == 0) - KernelDepthwiseConvFilterGrad( + KernelDepthwiseConvFilterGrad( output_grad_data, input_data, num, output_channels, output_height, output_width, input_channels, input_height, input_width, filter_multiplier, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, dilate_height, dilate_width, filter_grad_data); else - KernelDepthwiseConvFilterGrad( + KernelDepthwiseConvFilterGrad( output_grad_data, input_data, num, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_width, stride_height, @@ -415,8 +425,9 @@ __global__ void KernelDepthwiseConvFilterGradSp( * Ksize, strides, paddings are two elements. These two elements represent * height and width, respectively. 
*/ -template -class DepthwiseConvFunctor { +template +class DepthwiseConvFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, @@ -446,6 +457,10 @@ class DepthwiseConvFunctor { T* output_data = output->mutable_data(context.GetPlace()); int thread = 512; + if (output_width > 1024 && output_width <= 2048) + thread = (output_width - 1) / 2 + 1; + else if (output_width > 512 && output_width <= 1024) + thread = output_width; int blocks = std::min(std::max(thread / output_width, 1), output_height); dim3 threads(std::min(output_width, thread), blocks, 1); dim3 grid(output_channels, batch_size, 1); @@ -456,8 +471,9 @@ class DepthwiseConvFunctor { stride_height == stride_width && stride_height == c_stride && \ (ksize_height == ksize_width && ksize_height == c_filter || \ c_filter == -1)) { \ - KernelDepthwiseConvSp<<>>( \ + KernelDepthwiseConvSp< \ + T, c_filter_multiplier, c_stride, c_filter, \ + fuse_relu_before_conv><<>>( \ input_data, filter_data, batch_size, output_channels, output_height, \ output_width, input_channels, input_height, input_width, \ filter_multiplier, ksize_height, ksize_width, stride_height, \ @@ -480,8 +496,9 @@ class DepthwiseConvFunctor { } }; -template -class DepthwiseConvInputGradFunctor { +template +class DepthwiseConvInputGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, @@ -507,11 +524,16 @@ class DepthwiseConvInputGradFunctor { const int dilate_height = dilations[0]; const int dilate_width = dilations[1]; + const T* input_data = input.data(); const T* filter_data = filter.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int thread = 512; + if (input_width > 1024 && input_width <= 2048) + thread = (input_width - 1) / 2 + 1; + else if (input_width > 512 && input_width <= 1024) + thread = input_width; int blocks = std::min(std::max(thread / input_width, 1), input_height); dim3 threads(std::min(input_width, thread), blocks, 1); dim3 grid(input_channels, batch_size, 1); @@ -524,13 +546,13 @@ class DepthwiseConvInputGradFunctor { (ksize_height == ksize_width && ksize_height == c_filter || \ c_filter == -1)) { \ KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, c_stride, \ - c_filter><<>>( \ - output_grad_data, filter_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, input_grad_data); \ + T, c_filter_multiplier, c_stride, c_filter, \ + fuse_relu_before_conv><<>>( \ + input_data, output_grad_data, filter_data, batch_size, \ + output_channels, output_height, output_width, input_channels, \ + input_height, input_width, filter_multiplier, ksize_height, \ + ksize_width, stride_height, stride_width, padding_height, \ + padding_width, dilate_height, dilate_width, input_grad_data); \ return; \ } check_case(1, 1, 3); @@ -552,8 +574,9 @@ class DepthwiseConvInputGradFunctor { } }; -template -class DepthwiseConvFilterGradFunctor { +template +class DepthwiseConvFilterGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, @@ -583,6 +606,10 @@ class DepthwiseConvFilterGradFunctor { T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); int block_size = 512; + if (output_width > 1024 && output_width <= 2048) + 
block_size = (output_width - 1) / 2 + 1; + else if (output_width > 512 && output_width <= 1024) + block_size = output_width; int crop_output_height = std::min(std::max(block_size / output_width, 1), output_height); dim3 grid(ksize_width, ksize_height, output_channels); @@ -592,7 +619,8 @@ class DepthwiseConvFilterGradFunctor { #define check_case(c_filter_multiplier) \ if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \ KernelDepthwiseConvFilterGradSp< \ - T, c_filter_multiplier><<>>( \ + T, c_filter_multiplier, \ + fuse_relu_before_conv><<>>( \ output_grad_data, input_data, batch_size, output_channels, \ output_height, output_width, input_channels, input_height, \ input_width, filter_multiplier, ksize_height, ksize_width, \ @@ -606,18 +634,31 @@ class DepthwiseConvFilterGradFunctor { } }; -template class DepthwiseConvFunctor; -template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvInputGradFunctor; template class DepthwiseConvInputGradFunctor; + double, false>; + +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; + +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; + +template class DepthwiseConvInputGradFunctor; template class DepthwiseConvInputGradFunctor; + double, true>; template class DepthwiseConvFilterGradFunctor; + float, true>; template class DepthwiseConvFilterGradFunctor; + double, true>; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h index 71f6fcb23df1942d6dcf7177165f2ec1022a9b35..56648e4125bf1f5ef3eb128efa7849bbf5d4ad71 100644 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ b/paddle/fluid/operators/math/depthwise_conv.h @@ -26,7 +26,8 @@ namespace math { * \brief Compute the depthwise convolution which include * forward process and backpropagation process */ -template +template class DepthwiseConvFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, @@ -36,7 +37,8 @@ class DepthwiseConvFunctor { const std::vector& dilations, framework::Tensor* output); }; -template +template class DepthwiseConvInputGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, @@ -48,7 +50,8 @@ class DepthwiseConvInputGradFunctor { framework::Tensor* input_grad); }; -template +template class DepthwiseConvFilterGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 96fa428ee36bd8639d956623ec0554be7ae44501..96d0d16bf78a1f3fdc9f1ab755ea443bed0f4763 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1023,6 +1023,20 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") + .def_property( + "fuse_relu_depthwise_conv", + [](const BuildStrategy &self) { + return self.fuse_relu_depthwise_conv_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); + self.fuse_relu_depthwise_conv_ = b; + }, + R"DOC(The type is BOOL, fuse_relu_depthwise_conv indicate whether + to fuse relu and depthwise_conv2d, + it will save GPU memory and may make the execution faster. 
+ This options is only available in GPU devices. + Default False)DOC") .def_property( "memory_optimize", [](const BuildStrategy &self) { return self.memory_optimize_; }, diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py index baa14a573fcfdfa943af1e995f687c74e9fb4d07..1f7ec69dd7544a2835b9e336491c9d0fa2c76925 100644 --- a/python/paddle/fluid/contrib/memory_usage_calc.py +++ b/python/paddle/fluid/contrib/memory_usage_calc.py @@ -76,7 +76,7 @@ def memory_usage(program, batch_size): # Get the var_name list of first block and calculate total_memory = 0.0 - processed_var_names = set() + processed_var_names = set(["@EMPTY@"]) for op in program.global_block().ops: for var_name in op.output_arg_names: if var_name in processed_var_names: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ea88d8b4d09ab5816eabdfb978ee31143b9da5cd..503c91c27bafa764bec5cc3d22153e91ae3787d9 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1972,6 +1972,7 @@ def conv2d(input, 'groups': groups, 'use_cudnn': use_cudnn, 'use_mkldnn': False, + 'fuse_relu_before_depthwise_conv': False }) pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 1ba47d5a57665e00f08ffde9ebf3b5b10412c2ee..fdacd241f9e1f8d442f55098e2d192a3d57fdaf1 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -42,6 +42,7 @@ class TestParallelExecutorBase(unittest.TestCase): use_reduce=False, use_ir_memory_optimize=False, fuse_elewise_add_act_ops=False, + fuse_relu_depthwise_conv=False, optimizer=fluid.optimizer.Adam, use_fast_executor=False, enable_sequential_execution=False): @@ -60,7 +61,8 @@ class TestParallelExecutorBase(unittest.TestCase): loss = method(use_feed=feed_dict is not None) - optimizer().minimize(loss) + if optimizer: + optimizer().minimize(loss) if memory_opt: fluid.memory_optimize(main) @@ -76,6 +78,7 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv build_strategy.memory_optimize = use_ir_memory_optimize build_strategy.enable_sequential_execution = enable_sequential_execution if use_cuda and core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 25a9e8d46edb663600a1c1007cdda673e348a881..2927a9828fd5bbb9dd484487c7461c43a011fc87 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -70,6 +70,7 @@ class TestConv2dOp(OpTest): self.exhaustive_search = False self.use_cuda = False self.use_mkldnn = False + self.fuse_relu_before_depthwise_conv = False self.data_format = "AnyLayout" self.dtype = np.float32 self.init_kernel_type() @@ -84,8 +85,17 @@ class TestConv2dOp(OpTest): } input = np.random.random(self.input_size).astype(self.dtype) + if not self.testcuda(): + self.fuse_relu_before_depthwise_conv = False + if self.fuse_relu_before_depthwise_conv: + input = input - 0.5 + input -= (input < 0) * 0.1 
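+            # editor's note: together with the previous and next lines this
+            # keeps every element at least 0.1 away from zero, so the numeric
+            # gradient check never straddles the relu kink; input2 below is
+            # the post-relu input fed to the naive reference convolution.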
+            input += (input >= 0) * 0.1
+            input2 = np.maximum(input, 0.0)
+        else:
+            input2 = input
         filter = np.random.random(self.filter_size).astype(self.dtype)
-        output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups,
+        output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups,
                                                   conv2d_param)
         output = output.astype(self.dtype)

@@ -101,6 +111,8 @@ class TestConv2dOp(OpTest):
             'use_cudnn': self.use_cudnn,
             'use_mkldnn': self.use_mkldnn,
             'data_format': self.data_format,
+            'fuse_relu_before_depthwise_conv':
+            self.fuse_relu_before_depthwise_conv,
             'exhaustive_search': self.exhaustive_search
         }
         self.outputs = {'Output': output}
@@ -364,6 +376,78 @@ class TestDepthwiseConvWithDilation2(TestConv2dOp):
         self.op_type = "depthwise_conv2d"


+class TestDepthwiseConvandFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [3, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConv2andFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [3, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConv3andFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConvWithDilationandFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConvWithDilation2andFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
 class TestCUDNNExhaustiveSearch(TestConv2dOp):
     def init_kernel_type(self):
         self.use_cudnn = True
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c8531606b8e74e62ed7343a3d795b7438f61cd0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from parallel_executor_test_base import TestParallelExecutorBase
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+import paddle
+import paddle.dataset.mnist as mnist
+import unittest
+import os
+
+MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
+
+
+def norm(*args, **kargs):
+    return fluid.layers.batch_norm(*args, **kargs)
+
+
+def sep_conv(input, channel, stride, filter, dilation=1, act=None):
+    # with scope('depthwise'):
+    input = fluid.layers.conv2d(
+        input,
+        input.shape[1],
+        filter,
+        stride,
+        groups=input.shape[1],
+        padding=(filter // 2) * dilation,
+        dilation=dilation,
+        use_cudnn=False,
+        bias_attr=False)
+    input = norm(input)
+    if act: input = act(input)
+    # with scope('pointwise'):
+    input = fluid.layers.conv2d(
+        input, channel, 1, 1, groups=1, padding=0, bias_attr=False)
+    input = norm(input)
+    if act: input = act(input)
+    return input
+
+
+def simple_depthwise_net(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'])
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+    hidden = fluid.layers.reshape(img, (-1, 1, 28, 28))
+    for _ in range(4):
+        hidden = sep_conv(hidden, channel=200, stride=2, filter=5)
+        hidden = fluid.layers.relu(hidden)
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                MNIST_RECORDIO_FILE, reader, feeder)
+
+    def _init_data(self, random=True):
+        np.random.seed(5)
+        if random:
+            img = np.random.random(size=[32, 784]).astype(np.float32)
+        else:
+            img = np.ones(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    def _compare(self, model, use_cuda, random_data=True, only_forward=False):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        img, label = self._init_data(random_data)
+
+        def _optimizer(learning_rate=1e-6):
+            optimizer = fluid.optimizer.SGD(
+                learning_rate=learning_rate,
+                regularization=fluid.regularizer.L2Decay(1e-6))
+            return optimizer
+
+        if only_forward:
+            _optimizer = None
+
+        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            fuse_relu_depthwise_conv=True,
+            use_ir_memory_optimize=True,
+            memory_opt=False,
+            optimizer=_optimizer)
+        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            fuse_relu_depthwise_conv=False,
+            memory_opt=False,
+            optimizer=_optimizer)
+
+        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+
+    def test_simple_depthwise_with_fuse_op(self):
+        self._compare(simple_depthwise_net, True)
+        self._compare(simple_depthwise_net, False)
+
+    def test_simple_depthwise_with_fuse_op_only_forward(self):
+        self._compare(simple_depthwise_net, True, only_forward=True)
+        self._compare(simple_depthwise_net, False, only_forward=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
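
For reference, the contract behind the new fuse_relu_before_depthwise_conv attribute exercised in test_conv2d_op.py is simply "apply relu to the input, then run the depthwise convolution in one fused kernel". Below is a minimal NumPy sketch of that reference computation; it is an illustration rather than part of the patch, depthwise_conv2d_naive is a hypothetical stand-in for the test's conv2d_forward_naive helper, and it only covers a channel multiplier of 1 (filter shape [C, 1, kH, kW]).

import numpy as np


def relu(x):
    return np.maximum(x, 0.0)


def depthwise_conv2d_naive(x, w, stride=1, pad=1):
    # x: [N, C, H, W]; w: [C, 1, kH, kW]; groups == C, one filter per channel.
    n, c, h, width = x.shape
    _, _, kh, kw = w.shape
    xp = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')
    oh = (h + 2 * pad - kh) // stride + 1
    ow = (width + 2 * pad - kw) // stride + 1
    out = np.zeros((n, c, oh, ow), dtype=x.dtype)
    for i in range(oh):
        for j in range(ow):
            patch = xp[:, :, i * stride:i * stride + kh,
                       j * stride:j * stride + kw]
            out[:, :, i, j] = np.sum(patch * w[None, :, 0], axis=(2, 3))
    return out


# Mirror the unit test's input preparation: centre the data and nudge it away
# from zero so gradient checks never straddle the relu kink, then use
# relu(input) as the reference conv input (the test's `input2`).
x = np.random.random((2, 3, 5, 5)).astype(np.float32) - 0.5
x -= (x < 0) * 0.1
x += (x >= 0) * 0.1
w = np.random.random((3, 1, 3, 3)).astype(np.float32)
fused_reference = depthwise_conv2d_naive(relu(x), w)  # depthwise_conv2d(relu(x), w)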
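
And for how a user would actually switch the new pass on: a minimal end-to-end sketch under the Fluid 1.x Python API that this patch targets. This is illustrative caller code, not part of the diff; the toy network, shapes, and variable names are made up, and only build_strategy.fuse_relu_depthwise_conv comes from the patch itself.

import os
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core

main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
    img = fluid.layers.data(name='image', shape=[3, 32, 32], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # relu followed by a depthwise conv2d (groups == channels, cuDNN off) is
    # the pattern fuse_relu_depthwise_conv_pass rewrites into the fused kernel.
    x = fluid.layers.relu(img)
    x = fluid.layers.conv2d(
        x, num_filters=3, filter_size=3, padding=1, groups=3, use_cudnn=False)
    prediction = fluid.layers.fc(x, size=10, act='softmax')
    loss = fluid.layers.mean(
        fluid.layers.cross_entropy(input=prediction, label=label))
    fluid.optimizer.SGD(learning_rate=1e-3).minimize(loss)

use_cuda = core.is_compiled_with_cuda()
os.environ.setdefault('CPU_NUM', '4')  # ParallelExecutor needs this on CPU
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
fluid.Executor(place).run(startup)

build_strategy = fluid.BuildStrategy()
# The flag added by this patch; on non-CUDA builds the pass is skipped with a
# warning (see the check in build_strategy.cc), so setting it is harmless.
build_strategy.fuse_relu_depthwise_conv = True

pe = fluid.ParallelExecutor(
    use_cuda=use_cuda,
    loss_name=loss.name,
    main_program=main,
    build_strategy=build_strategy)
feed = {
    'image': np.random.random((8, 3, 32, 32)).astype('float32'),
    'label': np.random.randint(0, 10, size=(8, 1)).astype('int64'),
}
fetched = pe.run(fetch_list=[loss.name], feed=feed)  # fetched[0]: loss values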