Optimization of Kernels that related to DeepLabv3+ (#13534)

* refine reduce by cub * optimize KernelDepthwiseConvFilterGrad * optimize depthwise conv and reduce mean and reduce sum * fix bug: dilation * cuda arch and cuda 8 compatible

Optimization of Kernels that related to DeepLabv3+ (#13534)
* refine reduce by cub * optimize KernelDepthwiseConvFilterGrad * optimize depthwise conv and reduce mean and reduce sum * fix bug: dilation * cuda arch and cuda 8 compatible
161c3e31 · Dun · qingqing01 · 35b713c3 · 161c3e31 · 161c3e31
9 changed file
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -301,6 +301,7 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute)
 if (WITH_GPU)
    op_library(conv_op DEPS vol2col depthwise_conv im2col)
    op_library(layer_norm_op DEPS cub)
+    op_library(reduce_mean_op DEPS cub)
 else()
    op_library(conv_op DEPS vol2col im2col)
 endif()

--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
    math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;

    auto& dev_ctx = context.template device_context<DeviceContext>();
-    depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
+                  output);
  }
 };

@@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
      input_grad->mutable_data<T>(context.GetPlace());
      set_zero(dev_ctx, input_grad, static_cast<T>(0));
      depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                             paddings, input_grad);
+                             paddings, dilations, input_grad);
    }

    if (filter_grad) {
      filter_grad->mutable_data<T>(context.GetPlace());
      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
      depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
-                              filter_grad);
+                              dilations, filter_grad);
    }
  }
 };

--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
    math::DepthwiseConvInputGradFunctor<DeviceContext, T>
        depthwiseConvInputGrad;
    depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
-                           output);
+                           dilations, output);
  }
 };

@@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
    auto& dev_ctx = context.template device_context<DeviceContext>();
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");

    if (input_grad) {
      math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
-      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
+      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
                    input_grad);
    }

@@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
      math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
          depthwiseConvFilterGrad;
      depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
-                              filter_grad);
+                              dilations, filter_grad);
    }
  }
 };

--- a/paddle/fluid/operators/cub_reduce.h
+++ b/paddle/fluid/operators/cub_reduce.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <set>
+#include <vector>
+
+#include <cub/cub.cuh>  // NOLINT
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+
+namespace detail {
+template <typename T, size_t ElementCount>
+struct Array {
+ public:
+  HOSTDEVICE inline Array() {}
+
+  HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }
+
+  HOSTDEVICE inline const T& operator[](size_t index) const {
+    return data_[index];
+  }
+
+  HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }
+
+  template <typename VectorLikeType>
+  static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
+    PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
+    size_t n = static_cast<size_t>(vec.size());
+    Array<T, ElementCount> ret;
+    for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
+    return ret;
+  }
+
+ private:
+  T data_[ElementCount];
+};
+
+// reduce the last axis of 2d array
+template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
+          int BlockDim>
+__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
+                               TransformOp transformer, Ty init,
+                               int reduce_num) {
+  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
+  int idx_x = blockIdx.x * reduce_num;
+  int idx_y = threadIdx.x;
+  Ty reduce_var = init;
+  for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
+    reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));
+
+  reduce_var =
+      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
+
+  if (threadIdx.x == 0) {
+    y[blockIdx.x] = reduce_var;
+  }
+}
+
+template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
+          int BlockDim, int Rank, int ReduceRank>
+__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
+                             TransformOp transformer, Ty init, int reduce_num,
+                             Array<int, Rank> x_strides,
+                             Array<int, ReduceRank> reduce_dim,
+                             Array<int, ReduceRank> reduce_strides,
+                             Array<int, Rank - ReduceRank> left_dim,
+                             Array<int, Rank - ReduceRank> left_strides) {
+  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
+  Array<int, Rank> sub_index;
+  int left_idx = blockIdx.x;
+  for (int i = 0; i < Rank - ReduceRank; ++i) {
+    sub_index[left_dim[i]] = left_idx / left_strides[i];
+    left_idx %= left_strides[i];
+  }
+
+  int reduce_idx = threadIdx.x;
+  for (int j = 0; j < ReduceRank; ++j) {
+    sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
+    reduce_idx %= reduce_strides[j];
+  }
+
+  int idx_x = 0;
+  for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
+  Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));
+
+  for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
+    int reduce_idx = i;
+    for (int j = 0; j < ReduceRank; ++j) {
+      sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
+      reduce_idx %= reduce_strides[j];
+    }
+
+    int idx_x = 0;
+    for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
+    reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
+  }
+
+  reduce_var =
+      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
+
+  if (threadIdx.x == 0) {
+    y[blockIdx.x] = reduce_var;
+  }
+}
+
+static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
+  int n = static_cast<int>(dims.size());
+  if (n == 0) return std::vector<int>();
+  std::vector<int> strides(n);
+  strides.back() = 1;
+  for (int i = n - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * dims[i + 1];
+  }
+  return strides;
+}
+
+static inline std::vector<int> GetStrides(const std::vector<int>& dims,
+                                          const std::vector<int>& idx) {
+  int n = static_cast<int>(idx.size());
+  if (n == 0) return std::vector<int>();
+  std::vector<int> strides(n);
+  strides.back() = 1;
+  for (int i = n - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * dims[idx[i + 1]];
+  }
+  return strides;
+}
+
+constexpr int kMaxBlockDim = 512;
+
+static inline int GetDesiredBlockDim(int block_dim) {
+  return block_dim >= kMaxBlockDim
+             ? kMaxBlockDim
+             : (1 << static_cast<int>(std::log2(block_dim)));
+}
+
+template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
+          typename TransformOp>
+static void TensorReduceImpl(
+    const Tx* x_data, Ty* y_data, const platform::Place& place,
+    const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
+    int left_num, int reduce_num, const std::vector<int>& x_strides,
+    const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
+    const std::vector<int>& left_dim, const std::vector<int>& left_strides,
+    cudaStream_t stream) {
+#define CUB_RANK_CASE(i, ...)             \
+  case i: {                               \
+    constexpr auto kRank = i;             \
+    switch (reduce_rank) { __VA_ARGS__; } \
+  } break
+
+#define CUB_REDUCE_RANK_CASE(i, ...)                              \
+  case i: {                                                       \
+    constexpr auto kReduceRank = i;                               \
+    ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank,  \
+                 kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
+        x_data, y_data, reducer, transformer, init, reduce_num,   \
+        Array<int, kRank>::From(x_strides),                       \
+        Array<int, kReduceRank>::From(reduce_dim),                \
+        Array<int, kReduceRank>::From(reduce_strides),            \
+        Array<int, kRank - kReduceRank>::From(left_dim),          \
+        Array<int, kRank - kReduceRank>::From(left_strides));     \
+  } break
+
+  int rank = x_strides.size();
+  int reduce_rank = reduce_strides.size();
+  if (rank == reduce_rank) {
+    cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
+        x_data, transformer);
+    size_t temp_storage_bytes = 0;
+    cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
+                              reduce_num, reducer, init, stream);
+    framework::Tensor tmp;
+    auto* temp_storage = tmp.mutable_data<uint8_t>(
+        framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
+        place);
+    cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
+                              reduce_num, reducer, init, stream);
+    return;
+  }
+  if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
+    ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
+                   BlockDim><<<left_num, BlockDim, 0, stream>>>(
+        x_data, y_data, reducer, transformer, init, reduce_num);
+    return;
+  }
+  /*
+  if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
+    // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
+    // Currently, it is handled by code below, but inefficient
+    return;
+  }
+  */
+
+  switch (rank) {
+    CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
+
+    CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););
+
+    CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3););
+
+    CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););
+
+    CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5););
+
+    CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
+
+    CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
+
+    CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
+                  CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
+  }
+
+#undef CUB_REDUCE_RANK_CASE
+#undef CUB_RANK_CASE
+}
+
+}  // namespace detail
+
+template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
+void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
+                  std::vector<int> origin_reduce_dims, const Ty& init,
+                  const ReduceOp& reducer, const TransformOp& transformer,
+                  cudaStream_t stream) {
+  auto x_dim = framework::vectorize2int(x.dims());
+  std::vector<int> new_x_dim, new_reduce_dims;
+  int is_reduced = 0;
+  for (auto e : origin_reduce_dims) {
+    auto pos = e >= 0 ? e : e + x_dim.size();
+    is_reduced |= 1 << e;
+  }
+  for (int i = 0; i < x_dim.size(); i++) {
+    if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
+      new_x_dim.push_back(x_dim[i]);
+      if ((is_reduced >> i) & 1)
+        new_reduce_dims.push_back(new_x_dim.size() - 1);
+    } else {
+      new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
+    }
+  }
+  x_dim = new_x_dim;
+  origin_reduce_dims = new_reduce_dims;
+  int x_rank = static_cast<int>(x_dim.size());
+  std::set<int> left_set, reduce_set;
+  for (int i = 0; i < x_rank; ++i) left_set.insert(i);
+
+  for (auto e : origin_reduce_dims) {
+    left_set.erase(e);
+    reduce_set.insert(e);
+  }
+
+  std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
+  std::vector<int> left_dim(left_set.begin(), left_set.end());
+
+  std::vector<int> x_strides = detail::GetStrides(x_dim);
+  std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
+  std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
+  int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
+  int left_num = 1;
+  if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];
+
+  std::vector<int> y_dim(left_dim.size());
+  for (int i = 0; i < left_dim.size(); ++i) {
+    y_dim[i] = x_dim[left_dim[i]];
+  }
+  auto x_data = x.data<Tx>();
+  auto y_data = y->mutable_data<Ty>(x.place());
+  if (reduce_num == 1) return;
+
+#define CUB_BLOCK_DIM_CASE(block_dim)                                    \
+  case block_dim: {                                                      \
+    constexpr auto kBlockDim = block_dim;                                \
+    detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>(  \
+        x_data, y_data, x.place(), reducer, transformer, init, left_num, \
+        reduce_num, x_strides, reduce_dim, reduce_strides, left_dim,     \
+        left_strides, stream);                                           \
+  } break
+
+  switch (detail::GetDesiredBlockDim(reduce_num)) {
+    CUB_BLOCK_DIM_CASE(512);
+    CUB_BLOCK_DIM_CASE(256);
+    CUB_BLOCK_DIM_CASE(128);
+    CUB_BLOCK_DIM_CASE(64);
+    CUB_BLOCK_DIM_CASE(32);
+    CUB_BLOCK_DIM_CASE(16);
+    CUB_BLOCK_DIM_CASE(8);
+    CUB_BLOCK_DIM_CASE(4);
+    CUB_BLOCK_DIM_CASE(2);
+  }
+#undef CUB_BLOCK_DIM_CASE
+}
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <algorithm>
 #include <vector>
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -20,149 +21,268 @@ namespace paddle {
 namespace operators {
 namespace math {

+template <typename T>
+__inline__ __device__ T warpReduceSum(T val) {
+#if CUDA_VERSION < 9000
+  for (int offset = 16; offset > 0; offset /= 2)
+    val += __shfl_down(val, offset);
+  return val;
+#else
+#define FULL_MASK 0xffffffff
+  for (int offset = 16; offset > 0; offset /= 2)
+    val += __shfl_down_sync(FULL_MASK, val, offset);
+  return val;
+#endif
+}
+__forceinline__ __device__ unsigned lane_id() {
+  unsigned ret;
+  asm volatile("mov.u32 %0, %laneid;" : "=r"(ret));
+  return ret;
+}
+
+__forceinline__ __device__ unsigned warp_id() {
+  unsigned ret;
+  asm volatile("mov.u32 %0, %warpid;" : "=r"(ret));
+  return ret;
+}
+
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NCHW format.
 template <typename T>
-__global__ void KernelDepthwiseConv(
-    const int nthreads, const T* const input_data, const T* const filter_data,
-    const int batch_size, const int output_channels, const int output_height,
-    const int output_width, const int input_channels, const int input_height,
-    const int input_width, const int filter_multiplier, const int filter_height,
+__device__ __inline__ void KernelDepthwiseConv(
+    const T* const input_data, const T* const filter_data, const int batch_size,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
    const int filter_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, T* const output_data) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-
-  if (index < nthreads) {
-    const int batch = index / output_channels / output_height / output_width;
-    const int c_out = (index / output_height / output_width) % output_channels;
-    const int h_out = (index / output_width) % output_height;
-    const int w_out = index % output_width;
-
-    const int c_in = c_out / filter_multiplier;
-    const T* weight = filter_data + c_out * filter_height * filter_width;
-    T value = 0;
-    const int h_in_start = -padding_height + h_out * stride_height;
-    const int w_in_start = -padding_width + w_out * stride_width;
-    const int h_in_end = h_in_start + filter_height;
-    const int w_in_end = w_in_start + filter_width;
-
-    const int in_offset =
-        ((batch * input_channels + c_in) * input_height) * input_width;
-
-    const int h_end = h_in_end < input_height ? h_in_end : input_height;
-    const int w_end = w_in_end < input_width ? w_in_end : input_width;
-    const int h_start = h_in_start > 0 ? h_in_start : 0;
-    const int w_start = w_in_start > 0 ? w_in_start : 0;
-
-    for (int h_in = h_start; h_in < h_end; h_in++) {
-      for (int w_in = w_start; w_in < w_end; w_in++) {
-        const int offset = in_offset + h_in * input_width + w_in;
-        value +=
-            weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
-            input_data[offset];
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* const output_data) {
+  for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
+    for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
+      const int batch = blockIdx.y;
+      const int c_out = blockIdx.x;
+
+      const int c_in = c_out / filter_multiplier;
+      const T* weight = filter_data + c_out * filter_height * filter_width;
+      T value = 0;
+      const int h_in_start = -padding_height + h_out * stride_height;
+      const int w_in_start = -padding_width + w_out * stride_width;
+      const int h_in_end = h_in_start + filter_height * dilate_height;
+      const int w_in_end = w_in_start + filter_width * dilate_width;
+
+      const int in_offset =
+          ((batch * input_channels + c_in) * input_height) * input_width;
+
+      const int h_end = h_in_end < input_height ? h_in_end : input_height;
+      const int w_end = w_in_end < input_width ? w_in_end : input_width;
+      const int h_start = h_in_start > 0 ? h_in_start : 0;
+      const int w_start = w_in_start > 0 ? w_in_start : 0;
+      int weight_offset = 0;
+
+      for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) {
+        for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) {
+          if (h_in >= h_start && h_in < h_end && w_in >= w_start &&
+              w_in < w_end) {
+            const int offset = in_offset + h_in * input_width + w_in;
+            value += weight[weight_offset] * input_data[offset];
+          }
+          weight_offset++;
+        }
      }
+      int index =
+          ((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
+          w_out;
+      output_data[index] = value;
    }
-    output_data[index] = value;
  }
 }

+template <typename T, int c_filter_multiplier, int c_stride>
+__global__ void KernelDepthwiseConvSp(
+    const T* const input_data, const T* const filter_data, const int batch_size,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* const output_data) {
+  if (c_filter_multiplier == 0)
+    KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
+                           output_height, output_width, input_channels,
+                           input_height, input_width, filter_multiplier,
+                           filter_height, filter_width, stride_height,
+                           stride_width, padding_height, padding_width,
+                           dilate_height, dilate_width, output_data);
+
+  else
+    KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
+                           output_height, output_width, input_channels,
+                           input_height, input_width, c_filter_multiplier,
+                           filter_height, filter_height, c_stride, c_stride,
+                           padding_height, padding_width, dilate_height,
+                           dilate_width, output_data);
+}
+
 // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
 template <typename T>
-__global__ void KernelDepthwiseConvInputGrad(
-    const int nthreads, const T* const output_grad_data,
-    const T* const filter_data, const int batch_size, const int output_channels,
-    const int output_height, const int output_width, const int input_channels,
-    const int input_height, const int input_width, const int filter_multiplier,
-    const int filter_height, const int filter_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    T* const input_grad_data) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    const int batch = index / input_channels / input_height / input_width;
-    const int c_in = (index / input_height / input_width) % input_channels;
-    const int h_in = (index / input_width) % input_height;
-    const int w_in = index % input_width;
-
-    const int c_out_start = c_in * filter_multiplier;
-
-    int h_out_start =
-        (h_in - filter_height + padding_height + stride_height) / stride_height;
-    h_out_start = 0 > h_out_start ? 0 : h_out_start;
-
-    int h_out_end = (h_in + padding_height) / stride_height;
-    h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
-
-    int w_out_start =
-        (w_in - filter_width + padding_width + stride_width) / stride_width;
-    w_out_start = 0 > w_out_start ? 0 : w_out_start;
-
-    int w_out_end = (w_in + padding_width) / stride_width;
-    w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end;
-
-    T value = 0;
-
-    for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
-         c_out++) {
-      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
-        const int filter_h = h_in + padding_height - h_out * stride_height;
-        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
-          const int filter_w = w_in + padding_width - w_out * stride_width;
-          const int filter_offset = c_out * filter_height * filter_width +
-                                    filter_h * filter_width + filter_w;
-          const int output_grad_offset =
-              ((batch * output_channels + c_out) * output_height + h_out) *
-                  output_width +
-              w_out;
-          value +=
-              output_grad_data[output_grad_offset] * filter_data[filter_offset];
+__device__ __inline__ void KernelDepthwiseConvInputGrad(
+    const T* const output_grad_data, const T* const filter_data,
+    const int batch_size, const int output_channels, const int output_height,
+    const int output_width, const int input_channels, const int input_height,
+    const int input_width, const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* const input_grad_data) {
+  for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
+    for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
+      const int batch = blockIdx.y;
+      const int c_in = blockIdx.x;
+
+      const int c_out_start = c_in * filter_multiplier;
+
+      int h_out_start =
+          h_in - (filter_height - 1) * dilate_height + padding_height;
+
+      int h_out_end = h_in + padding_height;
+
+      int w_out_start =
+          w_in - (filter_width - 1) * dilate_width + padding_width;
+
+      int w_out_end = w_in + padding_width;
+
+      T value = 0;
+
+      for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
+           c_out++) {
+        int filter_offset = (c_out + 1) * filter_height * filter_width;
+        for (int h_out = h_out_start; h_out <= h_out_end;
+             h_out += dilate_height) {
+          for (int w_out = w_out_start; w_out <= w_out_end;
+               w_out += dilate_width) {
+            filter_offset--;
+            int s_h_out = h_out / stride_height;
+            int s_w_out = w_out / stride_width;
+            if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
+                s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
+                s_w_out < output_width) {
+              const int output_grad_offset =
+                  ((batch * output_channels + c_out) * output_height +
+                   s_h_out) *
+                      output_width +
+                  s_w_out;
+              value += output_grad_data[output_grad_offset] *
+                       filter_data[filter_offset];
+            }
+          }
        }
      }
+      int index =
+          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
+          w_in;
+      input_grad_data[index] = value;
    }
-    input_grad_data[index] += value;
  }
 }

+template <typename T, int c_filter_multiplier, int c_stride>
+__global__ void KernelDepthwiseConvInputGradSp(
+    const T* const output_grad_data, const T* const filter_data,
+    const int batch_size, const int output_channels, const int output_height,
+    const int output_width, const int input_channels, const int input_height,
+    const int input_width, const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* const input_grad_data) {
+  if (c_filter_multiplier == 0)
+    KernelDepthwiseConvInputGrad<T>(
+        output_grad_data, filter_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height, input_width,
+        filter_multiplier, filter_height, filter_width, stride_height,
+        stride_width, padding_height, padding_width, dilate_height,
+        dilate_width, input_grad_data);
+  else
+    KernelDepthwiseConvInputGrad<T>(
+        output_grad_data, filter_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height, input_width,
+        c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
+        padding_height, padding_width, dilate_height, dilate_width,
+        input_grad_data);
+}
+
 // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T>
-__global__ void KernelDepthwiseConvFilterGrad(
-    const int nthreads, const T* const output_grad_data,
-    const T* const input_data, const int num, const int output_channels,
-    const int output_height, const int output_width, const int input_channels,
-    const int input_height, const int input_width, const int filter_multiplier,
-    const int filter_height, const int filter_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    T* const filter_grad_data) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    const int w_out = index % output_width;
-    const int h_out = (index / output_width) % output_height;
-    const int c_out = (index / output_width / output_height) % output_channels;
-    const int batch = (index / output_width / output_height / output_channels);
-    const int c_in = c_out / filter_multiplier;
-    const int h_in_start = -padding_height + h_out * stride_height;
-    const int w_in_start = -padding_width + w_out * stride_width;
-    const int h_in_end =
-        -padding_height + h_out * stride_height + filter_height;
-    const int w_in_end = -padding_width + w_out * stride_width + filter_width;
-    const int in_offset =
-        (batch * input_channels + c_in) * input_height * input_width;
-
-    T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
-    const int h_end = h_in_end < input_height ? h_in_end : input_height;
-    const int w_end = w_in_end < input_width ? w_in_end : input_width;
-    const int h_start = h_in_start > 0 ? h_in_start : 0;
-    const int w_start = w_in_start > 0 ? w_in_start : 0;
-
-    for (int h_in = h_start; h_in < h_end; h_in++) {
-      for (int w_in = w_start; w_in < w_end; w_in++) {
-        const int offset = in_offset + h_in * input_width + w_in;
-        const T diff_temp = output_grad_data[index] * input_data[offset];
-        T* addr = addr_offset + (h_in - h_in_start) * filter_width +
-                  (w_in - w_in_start);
-        paddle::platform::CudaAtomicAdd(addr, diff_temp);
+__device__ __inline__ void KernelDepthwiseConvFilterGrad(
+    const T* output_grad_data, const T* input_data, const int num,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* filter_grad_data) {
+  T s = 0;
+
+  int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
+  int lid = lane_id();
+
+  for (int image_w = threadIdx.x; image_w < output_width;
+       image_w += blockDim.x) {
+    for (int bid = 0; bid < num; bid++) {
+      for (int image_h = threadIdx.y; image_h < output_height;
+           image_h += blockDim.y) {
+        int kernel_id = blockIdx.z;
+        int kernel_h = blockIdx.y * dilate_height - padding_height;
+        int kernel_w = blockIdx.x * dilate_width - padding_width;
+
+        int image_hk = image_h * stride_height + kernel_h;
+        int image_wk = image_w * stride_width + kernel_w;
+        if (image_hk < 0 || image_hk >= input_height) continue;
+        if (image_wk < 0 || image_wk >= input_width) continue;
+#define gaid(N, C, H, W) \
+  ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
+
+        s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
+             input_data[((bid * (gridDim.z / filter_multiplier) +
+                          kernel_id / filter_multiplier) *
+                             input_height +
+                         image_hk) *
+                            input_width +
+                        image_wk];
+
+#undef gaid
      }
    }
  }
+#if __CUDA_ARCH__ >= 530
+  s = warpReduceSum<T>(s);
+  if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
+#else
+  paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
+#endif
+}
+
+template <typename T, int c_filter_multiplier>
+__global__ void KernelDepthwiseConvFilterGradSp(
+    const T* output_grad_data, const T* input_data, const int num,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* filter_grad_data) {
+  if (c_filter_multiplier == 0)
+    KernelDepthwiseConvFilterGrad<T>(
+        output_grad_data, input_data, num, output_channels, output_height,
+        output_width, input_channels, input_height, input_width,
+        filter_multiplier, filter_height, filter_width, stride_height,
+        stride_width, padding_height, padding_width, dilate_height,
+        dilate_width, filter_grad_data);
+  else
+    KernelDepthwiseConvFilterGrad<T>(
+        output_grad_data, input_data, num, output_channels, output_height,
+        output_width, input_channels, input_height, input_width,
+        c_filter_multiplier, filter_height, filter_width, stride_height,
+        stride_width, padding_height, padding_width, dilate_height,
+        dilate_width, filter_grad_data);
 }

 /*
@@ -177,7 +297,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
                  const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output) {
+                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
+                  framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
@@ -191,22 +313,37 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
    const int padding_width = paddings[1];
+    const int dilate_height = dilations[0];
+    const int dilate_width = dilations[1];

    const T* input_data = input.data<T>();
    const T* filter_data = filter.data<T>();
    T* output_data = output->mutable_data<T>(context.GetPlace());

-    int nthreads = batch_size * output_channels * output_height * output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        output_channels / input_channels, ksize_height, ksize_width,
-        stride_height, stride_width, padding_height, padding_width,
-        output_data);
+    int thread = 512;
+    int blocks = std::min(std::max(thread / output_width, 1), output_height);
+    dim3 threads(std::min(output_width, thread), blocks, 1);
+    dim3 grid(output_channels, batch_size, 1);
+    int filter_multiplier = output_channels / input_channels;
+#define check_case(c_filter_multiplier, c_stride)                            \
+  if (c_filter_multiplier == 0 ||                                            \
+      filter_multiplier == c_filter_multiplier &&                            \
+          stride_height == stride_width && stride_height == c_stride) {      \
+    KernelDepthwiseConvSp<T, c_filter_multiplier,                            \
+                          c_stride><<<grid, threads, 0, context.stream()>>>( \
+        input_data, filter_data, batch_size, output_channels, output_height, \
+        output_width, input_channels, input_height, input_width,             \
+        filter_multiplier, ksize_height, ksize_width, stride_height,         \
+        stride_width, padding_height, padding_width, dilate_height,          \
+        dilate_width, output_data);                                          \
+    return;                                                                  \
+  }
+    check_case(1, 1);
+    check_case(1, 2);
+    // NOTE(liangdun): 0,0 for other case
+    // add other case if needed, e.g. check_case(2^n,1)
+    check_case(0, 0);
+#undef check_case
  }
 };

@@ -219,6 +356,7 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
@@ -233,22 +371,39 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
    const int padding_width = paddings[1];
+    const int dilate_height = dilations[0];
+    const int dilate_width = dilations[1];

    const T* filter_data = filter.data<T>();
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

-    int nthreads = batch_size * input_channels * input_height * input_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, output_grad_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        output_channels / input_channels, ksize_height, ksize_width,
-        stride_height, stride_width, padding_height, padding_width,
-        input_grad_data);
+    int thread = 512;
+    int blocks = std::min(std::max(thread / input_width, 1), input_height);
+    dim3 threads(std::min(input_width, thread), blocks, 1);
+    dim3 grid(input_channels, batch_size, 1);
+    int filter_multiplier = output_channels / input_channels;
+
+#define check_case(c_filter_multiplier, c_stride)                       \
+  if (c_filter_multiplier == 0 ||                                       \
+      filter_multiplier == c_filter_multiplier &&                       \
+          stride_height == stride_width && stride_height == c_stride) { \
+    KernelDepthwiseConvInputGradSp<                                     \
+        T, c_filter_multiplier,                                         \
+        c_stride><<<grid, threads, 0, context.stream()>>>(              \
+        output_grad_data, filter_data, batch_size, output_channels,     \
+        output_height, output_width, input_channels, input_height,      \
+        input_width, filter_multiplier, ksize_height, ksize_width,      \
+        stride_height, stride_width, padding_height, padding_width,     \
+        dilate_height, dilate_width, input_grad_data);                  \
+    return;                                                             \
+  }
+    check_case(1, 1);
+    check_case(1, 2);
+    // NOTE(liangdun): 0,0 for other case
+    // add other case if needed, e.g. check_case(2^n,1)
+    check_case(0, 0);
+#undef check_case
  }
 };

@@ -260,6 +415,7 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                  framework::Tensor* filter_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
@@ -274,23 +430,34 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
    const int padding_width = paddings[1];
+    const int dilate_height = dilations[0];
+    const int dilate_width = dilations[1];

    const T* input_data = input.data<T>();
    const T* output_grad_data = output_grad.data<T>();
    T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());

-    int nthreads = batch_size * output_channels * output_height * output_width;
-
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, output_grad_data, input_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        output_channels / input_channels, ksize_height, ksize_width,
-        stride_height, stride_width, padding_height, padding_width,
-        filter_grad_data);
+    int block_size = 512;
+    int crop_output_height =
+        std::min(std::max(block_size / output_width, 1), output_height);
+    dim3 grid(ksize_width, ksize_height, output_channels);
+    dim3 threads(std::min(output_width, block_size), crop_output_height, 1);
+    int filter_multiplier = output_channels / input_channels;
+
+#define check_case(c_filter_multiplier)                                       \
+  if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \
+    KernelDepthwiseConvFilterGradSp<                                          \
+        T, c_filter_multiplier><<<grid, threads, 0, context.stream()>>>(      \
+        output_grad_data, input_data, batch_size, output_channels,            \
+        output_height, output_width, input_channels, input_height,            \
+        input_width, filter_multiplier, ksize_height, ksize_width,            \
+        stride_height, stride_width, padding_height, padding_width,           \
+        dilate_height, dilate_width, filter_grad_data);                       \
+    return;                                                                   \
+  }
+    check_case(1);
+    check_case(0);
+#undef check_case
  }
 };


--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
@@ -32,7 +32,8 @@ class DepthwiseConvFunctor {
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output);
+                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations, framework::Tensor* output);
 };

 template <typename DeviceContext, typename T>
@@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                  framework::Tensor* input_grad);
 };

@@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                  framework::Tensor* filter_grad);
 };


--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@@ -12,17 +12,64 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <vector>
+#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_mean_op.h"

-REGISTER_OP_CUDA_KERNEL(reduce_mean,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::MeanFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::MeanFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::MeanFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::MeanFunctor>);
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct DivideFunctor {
+  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
+
+ private:
+  T n_inv;
+};
+
+template <typename T>
+class ReduceMeanKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+
+    std::vector<int> reduce_dims;
+    if (reduce_all) {
+      reduce_dims.resize(input->dims().size());
+      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
+    } else {
+      for (auto e : dims) {
+        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
+      }
+    }
+
+    int reduce_num = 1;
+    for (int i = 0; i < reduce_dims.size(); ++i) {
+      reduce_num *= input->dims()[reduce_dims[i]];
+    }
+
+    auto stream = context.cuda_device_context().stream();
+    TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
+        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
+        DivideFunctor<T>(reduce_num), stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
+                        ops::ReduceMeanKernel<double>,
+                        ops::ReduceMeanKernel<int>,
+                        ops::ReduceMeanKernel<int64_t>);
+
 REGISTER_OP_CUDA_KERNEL(
    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::MeanGradFunctor>,

--- a/paddle/fluid/operators/reduce_sum_op.cu
+++ b/paddle/fluid/operators/reduce_sum_op.cu
@@ -12,17 +12,59 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_sum_op.h"

-REGISTER_OP_CUDA_KERNEL(reduce_sum,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::SumFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::SumFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::SumFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::SumFunctor>);
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct IdentityFunctor {
+  HOSTDEVICE explicit inline IdentityFunctor() {}
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x; }
+};
+
+template <typename T>
+class ReduceSumKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+
+    std::vector<int> reduce_dims;
+    if (reduce_all) {
+      reduce_dims.resize(input->dims().size());
+      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
+    } else {
+      for (auto e : dims) {
+        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
+      }
+    }
+
+    int reduce_num = 1;
+    for (int i = 0; i < reduce_dims.size(); ++i) {
+      reduce_num *= input->dims()[reduce_dims[i]];
+    }
+
+    auto stream = context.cuda_device_context().stream();
+    TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
+        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
+        IdentityFunctor<T>(), stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
+                        ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
+                        ops::ReduceSumKernel<int64_t>);
+
 REGISTER_OP_CUDA_KERNEL(
    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::SumGradFunctor>,

--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest):
    def setUp(self):
        self.op_type = "conv2d"
        self.use_cudnn = False
+        self.use_cuda = False
        self.use_mkldnn = False
        self.data_format = "AnyLayout"
        self.dtype = np.float32
@@ -101,24 +102,25 @@ class TestConv2dOp(OpTest):
        }
        self.outputs = {'Output': output}

-    def testcudnn(self):
-        return core.is_compiled_with_cuda() and self.use_cudnn
+    def testcuda(self):
+        return core.is_compiled_with_cuda() and (self.use_cudnn or
+                                                 self.use_cuda)

    def test_check_output(self):
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
        self.check_output_with_place(place, atol=1e-5)

    def test_check_grad(self):
        if self.dtype == np.float16:
            return
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
        self.check_grad_with_place(
            place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)

    def test_check_grad_no_filter(self):
        if self.dtype == np.float16:
            return
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
        self.check_grad_with_place(
            place, ['Input'],
            'Output',
@@ -128,7 +130,7 @@ class TestConv2dOp(OpTest):
    def test_check_grad_no_input(self):
        if self.dtype == np.float16:
            return
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
        self.check_grad_with_place(
            place, ['Filter'],
            'Output',
@@ -325,18 +327,33 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):

 class TestDepthwiseConv(TestConv2dOp):
    def init_test_case(self):
+        self.use_cuda = True
        self.pad = [1, 1]
        self.stride = [2, 2]
        self.input_size = [2, 3, 5, 5]  # NCHW
        self.groups = 3
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
+        self.filter_size = [3, f_c, 3, 3]
        self.op_type = "depthwise_conv2d"


 class TestDepthwiseConv2(TestConv2dOp):
    def init_test_case(self):
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [3, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConv3(TestConv2dOp):
+    def init_test_case(self):
+        self.use_cuda = True
        self.pad = [1, 1]
        self.stride = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
@@ -347,6 +364,34 @@ class TestDepthwiseConv2(TestConv2dOp):
        self.op_type = "depthwise_conv2d"


+class TestDepthwiseConvWithDilation(TestConv2dOp):
+    def init_test_case(self):
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConvWithDilation2(TestConv2dOp):
+    def init_test_case(self):
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):