提交 161c3e31 编写于 作者: D Dun 提交者: qingqing01

Optimization of Kernels that related to DeepLabv3+ (#13534)

* refine reduce by cub
* optimize KernelDepthwiseConvFilterGrad
* optimize depthwise conv and reduce mean and reduce sum
* fix bug: dilation
* cuda arch and cuda 8 compatible
上级 35b713c3
...@@ -301,6 +301,7 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute) ...@@ -301,6 +301,7 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub) op_library(layer_norm_op DEPS cub)
op_library(reduce_mean_op DEPS cub)
else() else()
op_library(conv_op DEPS vol2col im2col) op_library(conv_op DEPS vol2col im2col)
endif() endif()
......
...@@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel<T> { ...@@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv; math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
output);
} }
}; };
...@@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> { ...@@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
input_grad->mutable_data<T>(context.GetPlace()); input_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0)); set_zero(dev_ctx, input_grad, static_cast<T>(0));
depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
paddings, input_grad); paddings, dilations, input_grad);
} }
if (filter_grad) { if (filter_grad) {
filter_grad->mutable_data<T>(context.GetPlace()); filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0)); set_zero(dev_ctx, filter_grad, static_cast<T>(0));
depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
filter_grad); dilations, filter_grad);
} }
} }
}; };
......
...@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> { ...@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
math::DepthwiseConvInputGradFunctor<DeviceContext, T> math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad; depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
output); dilations, output);
} }
}; };
...@@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
if (input_grad) { if (input_grad) {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv; math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
input_grad); input_grad);
} }
...@@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
math::DepthwiseConvFilterGradFunctor<DeviceContext, T> math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad; depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings, depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
filter_grad); dilations, filter_grad);
} }
} }
}; };
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <vector>
#include <cub/cub.cuh> // NOLINT
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace detail {
template <typename T, size_t ElementCount>
struct Array {
public:
HOSTDEVICE inline Array() {}
HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }
HOSTDEVICE inline const T& operator[](size_t index) const {
return data_[index];
}
HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }
template <typename VectorLikeType>
static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
size_t n = static_cast<size_t>(vec.size());
Array<T, ElementCount> ret;
for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
return ret;
}
private:
T data_[ElementCount];
};
// reduce the last axis of 2d array
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
int BlockDim>
__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init,
int reduce_num) {
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
int idx_x = blockIdx.x * reduce_num;
int idx_y = threadIdx.x;
Ty reduce_var = init;
for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
}
}
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
int BlockDim, int Rank, int ReduceRank>
__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init, int reduce_num,
Array<int, Rank> x_strides,
Array<int, ReduceRank> reduce_dim,
Array<int, ReduceRank> reduce_strides,
Array<int, Rank - ReduceRank> left_dim,
Array<int, Rank - ReduceRank> left_strides) {
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
Array<int, Rank> sub_index;
int left_idx = blockIdx.x;
for (int i = 0; i < Rank - ReduceRank; ++i) {
sub_index[left_dim[i]] = left_idx / left_strides[i];
left_idx %= left_strides[i];
}
int reduce_idx = threadIdx.x;
for (int j = 0; j < ReduceRank; ++j) {
sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
reduce_idx %= reduce_strides[j];
}
int idx_x = 0;
for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));
for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
int reduce_idx = i;
for (int j = 0; j < ReduceRank; ++j) {
sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
reduce_idx %= reduce_strides[j];
}
int idx_x = 0;
for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
}
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
}
}
static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
int n = static_cast<int>(dims.size());
if (n == 0) return std::vector<int>();
std::vector<int> strides(n);
strides.back() = 1;
for (int i = n - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dims[i + 1];
}
return strides;
}
static inline std::vector<int> GetStrides(const std::vector<int>& dims,
const std::vector<int>& idx) {
int n = static_cast<int>(idx.size());
if (n == 0) return std::vector<int>();
std::vector<int> strides(n);
strides.back() = 1;
for (int i = n - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dims[idx[i + 1]];
}
return strides;
}
constexpr int kMaxBlockDim = 512;
static inline int GetDesiredBlockDim(int block_dim) {
return block_dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << static_cast<int>(std::log2(block_dim)));
}
template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
typename TransformOp>
static void TensorReduceImpl(
const Tx* x_data, Ty* y_data, const platform::Place& place,
const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
int left_num, int reduce_num, const std::vector<int>& x_strides,
const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
const std::vector<int>& left_dim, const std::vector<int>& left_strides,
cudaStream_t stream) {
#define CUB_RANK_CASE(i, ...) \
case i: { \
constexpr auto kRank = i; \
switch (reduce_rank) { __VA_ARGS__; } \
} break
#define CUB_REDUCE_RANK_CASE(i, ...) \
case i: { \
constexpr auto kReduceRank = i; \
ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, \
kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
x_data, y_data, reducer, transformer, init, reduce_num, \
Array<int, kRank>::From(x_strides), \
Array<int, kReduceRank>::From(reduce_dim), \
Array<int, kReduceRank>::From(reduce_strides), \
Array<int, kRank - kReduceRank>::From(left_dim), \
Array<int, kRank - kReduceRank>::From(left_strides)); \
} break
int rank = x_strides.size();
int reduce_rank = reduce_strides.size();
if (rank == reduce_rank) {
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer);
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
place);
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
return;
}
if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
BlockDim><<<left_num, BlockDim, 0, stream>>>(
x_data, y_data, reducer, transformer, init, reduce_num);
return;
}
/*
if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
// TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
// Currently, it is handled by code below, but inefficient
return;
}
*/
switch (rank) {
CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););
CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3););
CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););
CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5););
CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
}
#undef CUB_REDUCE_RANK_CASE
#undef CUB_RANK_CASE
}
} // namespace detail
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
std::vector<int> origin_reduce_dims, const Ty& init,
const ReduceOp& reducer, const TransformOp& transformer,
cudaStream_t stream) {
auto x_dim = framework::vectorize2int(x.dims());
std::vector<int> new_x_dim, new_reduce_dims;
int is_reduced = 0;
for (auto e : origin_reduce_dims) {
auto pos = e >= 0 ? e : e + x_dim.size();
is_reduced |= 1 << e;
}
for (int i = 0; i < x_dim.size(); i++) {
if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
new_x_dim.push_back(x_dim[i]);
if ((is_reduced >> i) & 1)
new_reduce_dims.push_back(new_x_dim.size() - 1);
} else {
new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
}
}
x_dim = new_x_dim;
origin_reduce_dims = new_reduce_dims;
int x_rank = static_cast<int>(x_dim.size());
std::set<int> left_set, reduce_set;
for (int i = 0; i < x_rank; ++i) left_set.insert(i);
for (auto e : origin_reduce_dims) {
left_set.erase(e);
reduce_set.insert(e);
}
std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
std::vector<int> left_dim(left_set.begin(), left_set.end());
std::vector<int> x_strides = detail::GetStrides(x_dim);
std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
int left_num = 1;
if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];
std::vector<int> y_dim(left_dim.size());
for (int i = 0; i < left_dim.size(); ++i) {
y_dim[i] = x_dim[left_dim[i]];
}
auto x_data = x.data<Tx>();
auto y_data = y->mutable_data<Ty>(x.place());
if (reduce_num == 1) return;
#define CUB_BLOCK_DIM_CASE(block_dim) \
case block_dim: { \
constexpr auto kBlockDim = block_dim; \
detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>( \
x_data, y_data, x.place(), reducer, transformer, init, left_num, \
reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, \
left_strides, stream); \
} break
switch (detail::GetDesiredBlockDim(reduce_num)) {
CUB_BLOCK_DIM_CASE(512);
CUB_BLOCK_DIM_CASE(256);
CUB_BLOCK_DIM_CASE(128);
CUB_BLOCK_DIM_CASE(64);
CUB_BLOCK_DIM_CASE(32);
CUB_BLOCK_DIM_CASE(16);
CUB_BLOCK_DIM_CASE(8);
CUB_BLOCK_DIM_CASE(4);
CUB_BLOCK_DIM_CASE(2);
}
#undef CUB_BLOCK_DIM_CASE
}
} // namespace operators
} // namespace paddle
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
...@@ -20,149 +21,268 @@ namespace paddle { ...@@ -20,149 +21,268 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename T>
__inline__ __device__ T warpReduceSum(T val) {
#if CUDA_VERSION < 9000
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
return val;
#else
#define FULL_MASK 0xffffffff
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(FULL_MASK, val, offset);
return val;
#endif
}
__forceinline__ __device__ unsigned lane_id() {
unsigned ret;
asm volatile("mov.u32 %0, %laneid;" : "=r"(ret));
return ret;
}
__forceinline__ __device__ unsigned warp_id() {
unsigned ret;
asm volatile("mov.u32 %0, %warpid;" : "=r"(ret));
return ret;
}
// A Cuda kernel to compute the depthwise convolution forward pass // A Cuda kernel to compute the depthwise convolution forward pass
// in NCHW format. // in NCHW format.
template <typename T> template <typename T>
__global__ void KernelDepthwiseConv( __device__ __inline__ void KernelDepthwiseConv(
const int nthreads, const T* const input_data, const T* const filter_data, const T* const input_data, const T* const filter_data, const int batch_size,
const int batch_size, const int output_channels, const int output_height, const int output_channels, const int output_height, const int output_width,
const int output_width, const int input_channels, const int input_height, const int input_channels, const int input_height, const int input_width,
const int input_width, const int filter_multiplier, const int filter_height, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width, const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, T* const output_data) { const int padding_height, const int padding_width, const int dilate_height,
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; const int dilate_width, T* const output_data) {
for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
if (index < nthreads) { for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
const int batch = index / output_channels / output_height / output_width; const int batch = blockIdx.y;
const int c_out = (index / output_height / output_width) % output_channels; const int c_out = blockIdx.x;
const int h_out = (index / output_width) % output_height;
const int w_out = index % output_width; const int c_in = c_out / filter_multiplier;
const T* weight = filter_data + c_out * filter_height * filter_width;
const int c_in = c_out / filter_multiplier; T value = 0;
const T* weight = filter_data + c_out * filter_height * filter_width; const int h_in_start = -padding_height + h_out * stride_height;
T value = 0; const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_start = -padding_height + h_out * stride_height; const int h_in_end = h_in_start + filter_height * dilate_height;
const int w_in_start = -padding_width + w_out * stride_width; const int w_in_end = w_in_start + filter_width * dilate_width;
const int h_in_end = h_in_start + filter_height;
const int w_in_end = w_in_start + filter_width; const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width;
const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width; const int h_end = h_in_end < input_height ? h_in_end : input_height;
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_end = h_in_end < input_height ? h_in_end : input_height; const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_end = w_in_end < input_width ? w_in_end : input_width; const int w_start = w_in_start > 0 ? w_in_start : 0;
const int h_start = h_in_start > 0 ? h_in_start : 0; int weight_offset = 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) {
for (int h_in = h_start; h_in < h_end; h_in++) { for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) {
for (int w_in = w_start; w_in < w_end; w_in++) { if (h_in >= h_start && h_in < h_end && w_in >= w_start &&
const int offset = in_offset + h_in * input_width + w_in; w_in < w_end) {
value += const int offset = in_offset + h_in * input_width + w_in;
weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * value += weight[weight_offset] * input_data[offset];
input_data[offset]; }
weight_offset++;
}
} }
int index =
((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
w_out;
output_data[index] = value;
} }
output_data[index] = value;
} }
} }
template <typename T, int c_filter_multiplier, int c_stride>
__global__ void KernelDepthwiseConvSp(
const T* const input_data, const T* const filter_data, const int batch_size,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const output_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels,
input_height, input_width, filter_multiplier,
filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width,
dilate_height, dilate_width, output_data);
else
KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels,
input_height, input_width, c_filter_multiplier,
filter_height, filter_height, c_stride, c_stride,
padding_height, padding_width, dilate_height,
dilate_width, output_data);
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t input. // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <typename T> template <typename T>
__global__ void KernelDepthwiseConvInputGrad( __device__ __inline__ void KernelDepthwiseConvInputGrad(
const int nthreads, const T* const output_grad_data, const T* const output_grad_data, const T* const filter_data,
const T* const filter_data, const int batch_size, const int output_channels, const int batch_size, const int output_channels, const int output_height,
const int output_height, const int output_width, const int input_channels, const int output_width, const int input_channels, const int input_height,
const int input_height, const int input_width, const int filter_multiplier, const int input_width, const int filter_multiplier, const int filter_height,
const int filter_height, const int filter_width, const int stride_height, const int filter_width, const int stride_height, const int stride_width,
const int stride_width, const int padding_height, const int padding_width, const int padding_height, const int padding_width, const int dilate_height,
T* const input_grad_data) { const int dilate_width, T* const input_grad_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
if (index < nthreads) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
const int batch = index / input_channels / input_height / input_width; const int batch = blockIdx.y;
const int c_in = (index / input_height / input_width) % input_channels; const int c_in = blockIdx.x;
const int h_in = (index / input_width) % input_height;
const int w_in = index % input_width; const int c_out_start = c_in * filter_multiplier;
const int c_out_start = c_in * filter_multiplier; int h_out_start =
h_in - (filter_height - 1) * dilate_height + padding_height;
int h_out_start =
(h_in - filter_height + padding_height + stride_height) / stride_height; int h_out_end = h_in + padding_height;
h_out_start = 0 > h_out_start ? 0 : h_out_start;
int w_out_start =
int h_out_end = (h_in + padding_height) / stride_height; w_in - (filter_width - 1) * dilate_width + padding_width;
h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
int w_out_end = w_in + padding_width;
int w_out_start =
(w_in - filter_width + padding_width + stride_width) / stride_width; T value = 0;
w_out_start = 0 > w_out_start ? 0 : w_out_start;
for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
int w_out_end = (w_in + padding_width) / stride_width; c_out++) {
w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end; int filter_offset = (c_out + 1) * filter_height * filter_width;
for (int h_out = h_out_start; h_out <= h_out_end;
T value = 0; h_out += dilate_height) {
for (int w_out = w_out_start; w_out <= w_out_end;
for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; w_out += dilate_width) {
c_out++) { filter_offset--;
for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { int s_h_out = h_out / stride_height;
const int filter_h = h_in + padding_height - h_out * stride_height; int s_w_out = w_out / stride_width;
for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
const int filter_w = w_in + padding_width - w_out * stride_width; s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
const int filter_offset = c_out * filter_height * filter_width + s_w_out < output_width) {
filter_h * filter_width + filter_w; const int output_grad_offset =
const int output_grad_offset = ((batch * output_channels + c_out) * output_height +
((batch * output_channels + c_out) * output_height + h_out) * s_h_out) *
output_width + output_width +
w_out; s_w_out;
value += value += output_grad_data[output_grad_offset] *
output_grad_data[output_grad_offset] * filter_data[filter_offset]; filter_data[filter_offset];
}
}
} }
} }
int index =
((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
w_in;
input_grad_data[index] = value;
} }
input_grad_data[index] += value;
} }
} }
template <typename T, int c_filter_multiplier, int c_stride>
__global__ void KernelDepthwiseConvInputGradSp(
const T* const output_grad_data, const T* const filter_data,
const int batch_size, const int output_channels, const int output_height,
const int output_width, const int input_channels, const int input_height,
const int input_width, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const input_grad_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConvInputGrad<T>(
output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, input_grad_data);
else
KernelDepthwiseConvInputGrad<T>(
output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
padding_height, padding_width, dilate_height, dilate_width,
input_grad_data);
}
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename T> template <typename T>
__global__ void KernelDepthwiseConvFilterGrad( __device__ __inline__ void KernelDepthwiseConvFilterGrad(
const int nthreads, const T* const output_grad_data, const T* output_grad_data, const T* input_data, const int num,
const T* const input_data, const int num, const int output_channels, const int output_channels, const int output_height, const int output_width,
const int output_height, const int output_width, const int input_channels, const int input_channels, const int input_height, const int input_width,
const int input_height, const int input_width, const int filter_multiplier, const int filter_multiplier, const int filter_height,
const int filter_height, const int filter_width, const int stride_height, const int filter_width, const int stride_height, const int stride_width,
const int stride_width, const int padding_height, const int padding_width, const int padding_height, const int padding_width, const int dilate_height,
T* const filter_grad_data) { const int dilate_width, T* filter_grad_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; T s = 0;
if (index < nthreads) {
const int w_out = index % output_width; int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
const int h_out = (index / output_width) % output_height; int lid = lane_id();
const int c_out = (index / output_width / output_height) % output_channels;
const int batch = (index / output_width / output_height / output_channels); for (int image_w = threadIdx.x; image_w < output_width;
const int c_in = c_out / filter_multiplier; image_w += blockDim.x) {
const int h_in_start = -padding_height + h_out * stride_height; for (int bid = 0; bid < num; bid++) {
const int w_in_start = -padding_width + w_out * stride_width; for (int image_h = threadIdx.y; image_h < output_height;
const int h_in_end = image_h += blockDim.y) {
-padding_height + h_out * stride_height + filter_height; int kernel_id = blockIdx.z;
const int w_in_end = -padding_width + w_out * stride_width + filter_width; int kernel_h = blockIdx.y * dilate_height - padding_height;
const int in_offset = int kernel_w = blockIdx.x * dilate_width - padding_width;
(batch * input_channels + c_in) * input_height * input_width;
int image_hk = image_h * stride_height + kernel_h;
T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; int image_wk = image_w * stride_width + kernel_w;
const int h_end = h_in_end < input_height ? h_in_end : input_height; if (image_hk < 0 || image_hk >= input_height) continue;
const int w_end = w_in_end < input_width ? w_in_end : input_width; if (image_wk < 0 || image_wk >= input_width) continue;
const int h_start = h_in_start > 0 ? h_in_start : 0; #define gaid(N, C, H, W) \
const int w_start = w_in_start > 0 ? w_in_start : 0; ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
for (int h_in = h_start; h_in < h_end; h_in++) { s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
for (int w_in = w_start; w_in < w_end; w_in++) { input_data[((bid * (gridDim.z / filter_multiplier) +
const int offset = in_offset + h_in * input_width + w_in; kernel_id / filter_multiplier) *
const T diff_temp = output_grad_data[index] * input_data[offset]; input_height +
T* addr = addr_offset + (h_in - h_in_start) * filter_width + image_hk) *
(w_in - w_in_start); input_width +
paddle::platform::CudaAtomicAdd(addr, diff_temp); image_wk];
#undef gaid
} }
} }
} }
#if __CUDA_ARCH__ >= 530
s = warpReduceSum<T>(s);
if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
#else
paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
#endif
}
template <typename T, int c_filter_multiplier>
__global__ void KernelDepthwiseConvFilterGradSp(
const T* output_grad_data, const T* input_data, const int num,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* filter_grad_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConvFilterGrad<T>(
output_grad_data, input_data, num, output_channels, output_height,
output_width, input_channels, input_height, input_width,
filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, filter_grad_data);
else
KernelDepthwiseConvFilterGrad<T>(
output_grad_data, input_data, num, output_channels, output_height,
output_width, input_channels, input_height, input_width,
c_filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, filter_grad_data);
} }
/* /*
...@@ -177,7 +297,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> { ...@@ -177,7 +297,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& input, const framework::Tensor& input,
const framework::Tensor& filter, const framework::Tensor& filter,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output) { const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* output) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
...@@ -191,22 +313,37 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> { ...@@ -191,22 +313,37 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1]; const int stride_width = strides[1];
const int padding_height = paddings[0]; const int padding_height = paddings[0];
const int padding_width = paddings[1]; const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* filter_data = filter.data<T>(); const T* filter_data = filter.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace()); T* output_data = output->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width; int thread = 512;
int blocks = (nthreads + 1024 - 1) / 1024; int blocks = std::min(std::max(thread / output_width, 1), output_height);
dim3 threads(1024, 1); dim3 threads(std::min(output_width, thread), blocks, 1);
dim3 grid(blocks, 1); dim3 grid(output_channels, batch_size, 1);
int filter_multiplier = output_channels / input_channels;
KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>( #define check_case(c_filter_multiplier, c_stride) \
nthreads, input_data, filter_data, batch_size, output_channels, if (c_filter_multiplier == 0 || \
output_height, output_width, input_channels, input_height, input_width, filter_multiplier == c_filter_multiplier && \
output_channels / input_channels, ksize_height, ksize_width, stride_height == stride_width && stride_height == c_stride) { \
stride_height, stride_width, padding_height, padding_width, KernelDepthwiseConvSp<T, c_filter_multiplier, \
output_data); c_stride><<<grid, threads, 0, context.stream()>>>( \
input_data, filter_data, batch_size, output_channels, output_height, \
output_width, input_channels, input_height, input_width, \
filter_multiplier, ksize_height, ksize_width, stride_height, \
stride_width, padding_height, padding_width, dilate_height, \
dilate_width, output_data); \
return; \
}
check_case(1, 1);
check_case(1, 2);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
check_case(0, 0);
#undef check_case
} }
}; };
...@@ -219,6 +356,7 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> { ...@@ -219,6 +356,7 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad) { framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
...@@ -233,22 +371,39 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> { ...@@ -233,22 +371,39 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1]; const int stride_width = strides[1];
const int padding_height = paddings[0]; const int padding_height = paddings[0];
const int padding_width = paddings[1]; const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* filter_data = filter.data<T>(); const T* filter_data = filter.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * input_channels * input_height * input_width; int thread = 512;
int blocks = (nthreads + 1024 - 1) / 1024; int blocks = std::min(std::max(thread / input_width, 1), input_height);
dim3 threads(1024, 1); dim3 threads(std::min(input_width, thread), blocks, 1);
dim3 grid(blocks, 1); dim3 grid(input_channels, batch_size, 1);
int filter_multiplier = output_channels / input_channels;
KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, output_grad_data, filter_data, batch_size, output_channels, #define check_case(c_filter_multiplier, c_stride) \
output_height, output_width, input_channels, input_height, input_width, if (c_filter_multiplier == 0 || \
output_channels / input_channels, ksize_height, ksize_width, filter_multiplier == c_filter_multiplier && \
stride_height, stride_width, padding_height, padding_width, stride_height == stride_width && stride_height == c_stride) { \
input_grad_data); KernelDepthwiseConvInputGradSp< \
T, c_filter_multiplier, \
c_stride><<<grid, threads, 0, context.stream()>>>( \
output_grad_data, filter_data, batch_size, output_channels, \
output_height, output_width, input_channels, input_height, \
input_width, filter_multiplier, ksize_height, ksize_width, \
stride_height, stride_width, padding_height, padding_width, \
dilate_height, dilate_width, input_grad_data); \
return; \
}
check_case(1, 1);
check_case(1, 2);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
check_case(0, 0);
#undef check_case
} }
}; };
...@@ -260,6 +415,7 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> { ...@@ -260,6 +415,7 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad) { framework::Tensor* filter_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
...@@ -274,23 +430,34 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> { ...@@ -274,23 +430,34 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1]; const int stride_width = strides[1];
const int padding_height = paddings[0]; const int padding_height = paddings[0];
const int padding_width = paddings[1]; const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace()); T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width; int block_size = 512;
int crop_output_height =
int blocks = (nthreads + 1024 - 1) / 1024; std::min(std::max(block_size / output_width, 1), output_height);
dim3 threads(1024, 1); dim3 grid(ksize_width, ksize_height, output_channels);
dim3 grid(blocks, 1); dim3 threads(std::min(output_width, block_size), crop_output_height, 1);
int filter_multiplier = output_channels / input_channels;
KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, output_grad_data, input_data, batch_size, output_channels, #define check_case(c_filter_multiplier) \
output_height, output_width, input_channels, input_height, input_width, if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \
output_channels / input_channels, ksize_height, ksize_width, KernelDepthwiseConvFilterGradSp< \
stride_height, stride_width, padding_height, padding_width, T, c_filter_multiplier><<<grid, threads, 0, context.stream()>>>( \
filter_grad_data); output_grad_data, input_data, batch_size, output_channels, \
output_height, output_width, input_channels, input_height, \
input_width, filter_multiplier, ksize_height, ksize_width, \
stride_height, stride_width, padding_height, padding_width, \
dilate_height, dilate_width, filter_grad_data); \
return; \
}
check_case(1);
check_case(0);
#undef check_case
} }
}; };
......
...@@ -32,7 +32,8 @@ class DepthwiseConvFunctor { ...@@ -32,7 +32,8 @@ class DepthwiseConvFunctor {
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter, const framework::Tensor& filter,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output); const std::vector<int>& paddings,
const std::vector<int>& dilations, framework::Tensor* output);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor { ...@@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad); framework::Tensor* input_grad);
}; };
...@@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor { ...@@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad); framework::Tensor* filter_grad);
}; };
......
...@@ -12,17 +12,64 @@ ...@@ -12,17 +12,64 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <vector>
#include "paddle/fluid/operators/cub_reduce.h"
#include "paddle/fluid/operators/reduce_mean_op.h" #include "paddle/fluid/operators/reduce_mean_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_mean, namespace paddle {
ops::ReduceKernel<paddle::platform::CUDADeviceContext, namespace operators {
float, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, template <typename T>
double, ops::MeanFunctor>, struct DivideFunctor {
ops::ReduceKernel<paddle::platform::CUDADeviceContext, HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
int, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
int64_t, ops::MeanFunctor>);
private:
T n_inv;
};
template <typename T>
class ReduceMeanKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
std::vector<int> reduce_dims;
if (reduce_all) {
reduce_dims.resize(input->dims().size());
for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
} else {
for (auto e : dims) {
reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
}
}
int reduce_num = 1;
for (int i = 0; i < reduce_dims.size(); ++i) {
reduce_num *= input->dims()[reduce_dims[i]];
}
auto stream = context.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
DivideFunctor<T>(reduce_num), stream);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
ops::ReduceMeanKernel<double>,
ops::ReduceMeanKernel<int>,
ops::ReduceMeanKernel<int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanGradFunctor>, float, ops::MeanGradFunctor>,
......
...@@ -12,17 +12,59 @@ ...@@ -12,17 +12,59 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/cub_reduce.h"
#include "paddle/fluid/operators/reduce_sum_op.h" #include "paddle/fluid/operators/reduce_sum_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_sum, namespace paddle {
ops::ReduceKernel<paddle::platform::CUDADeviceContext, namespace operators {
float, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, template <typename T>
double, ops::SumFunctor>, struct IdentityFunctor {
ops::ReduceKernel<paddle::platform::CUDADeviceContext, HOSTDEVICE explicit inline IdentityFunctor() {}
int, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, HOSTDEVICE inline T operator()(const T& x) const { return x; }
int64_t, ops::SumFunctor>); };
template <typename T>
class ReduceSumKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
std::vector<int> reduce_dims;
if (reduce_all) {
reduce_dims.resize(input->dims().size());
for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
} else {
for (auto e : dims) {
reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
}
}
int reduce_num = 1;
for (int i = 0; i < reduce_dims.size(); ++i) {
reduce_num *= input->dims()[reduce_dims[i]];
}
auto stream = context.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
IdentityFunctor<T>(), stream);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
ops::ReduceSumKernel<int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::SumGradFunctor>, float, ops::SumGradFunctor>,
......
...@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest): ...@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "conv2d" self.op_type = "conv2d"
self.use_cudnn = False self.use_cudnn = False
self.use_cuda = False
self.use_mkldnn = False self.use_mkldnn = False
self.data_format = "AnyLayout" self.data_format = "AnyLayout"
self.dtype = np.float32 self.dtype = np.float32
...@@ -101,24 +102,25 @@ class TestConv2dOp(OpTest): ...@@ -101,24 +102,25 @@ class TestConv2dOp(OpTest):
} }
self.outputs = {'Output': output} self.outputs = {'Output': output}
def testcudnn(self): def testcuda(self):
return core.is_compiled_with_cuda() and self.use_cudnn return core.is_compiled_with_cuda() and (self.use_cudnn or
self.use_cuda)
def test_check_output(self): def test_check_output(self):
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_output_with_place(place, atol=1e-5) self.check_output_with_place(place, atol=1e-5)
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_grad_with_place( self.check_grad_with_place(
place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02) place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
def test_check_grad_no_filter(self): def test_check_grad_no_filter(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_grad_with_place( self.check_grad_with_place(
place, ['Input'], place, ['Input'],
'Output', 'Output',
...@@ -128,7 +130,7 @@ class TestConv2dOp(OpTest): ...@@ -128,7 +130,7 @@ class TestConv2dOp(OpTest):
def test_check_grad_no_input(self): def test_check_grad_no_input(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_grad_with_place( self.check_grad_with_place(
place, ['Filter'], place, ['Filter'],
'Output', 'Output',
...@@ -325,18 +327,33 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): ...@@ -325,18 +327,33 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
class TestDepthwiseConv(TestConv2dOp): class TestDepthwiseConv(TestConv2dOp):
def init_test_case(self): def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1] self.pad = [1, 1]
self.stride = [2, 2] self.stride = [2, 2]
self.input_size = [2, 3, 5, 5] # NCHW self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3 self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0 assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3] self.filter_size = [3, f_c, 3, 3]
self.op_type = "depthwise_conv2d" self.op_type = "depthwise_conv2d"
class TestDepthwiseConv2(TestConv2dOp): class TestDepthwiseConv2(TestConv2dOp):
def init_test_case(self): def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [3, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConv3(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1] self.pad = [1, 1]
self.stride = [1, 1] self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW self.input_size = [2, 3, 5, 5] # NCHW
...@@ -347,6 +364,34 @@ class TestDepthwiseConv2(TestConv2dOp): ...@@ -347,6 +364,34 @@ class TestDepthwiseConv2(TestConv2dOp):
self.op_type = "depthwise_conv2d" self.op_type = "depthwise_conv2d"
class TestDepthwiseConvWithDilation(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
self.dilations = [2, 2]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConvWithDilation2(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
self.dilations = [2, 2]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
# Please Don't remove the following code. # Please Don't remove the following code.
# Currently, CI use cudnn V5.0 which not support dilation conv. # Currently, CI use cudnn V5.0 which not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation): # class TestCUDNNWithDilation(TestWithDilation):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册