Commit 1fb82148 authored by Dun, committed by Dang Qingqing

Optimization of kernels related to DeepLabv3+ (#13534)

* Refine reduce ops with CUB
* Optimize KernelDepthwiseConvFilterGrad
* Optimize depthwise conv and reduce_mean / reduce_sum
* Fix bug: dilation
* CUDA arch and CUDA 8 compatibility

test=release/1.0.0
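
The rewritten filter-gradient kernel in this commit accumulates a per-thread partial sum, reduces it across the warp with shuffle intrinsics, and lets only lane 0 perform the atomic add into global memory; the CUDA_VERSION guard keeps the shuffle path buildable on CUDA 8. A minimal standalone sketch of that pattern follows (hypothetical names, not part of the commit; the real implementation is in paddle/fluid/operators/math/depthwise_conv.cu below):

#include <cuda.h>
#include <cuda_runtime.h>

// Sum a value across the 32 lanes of a warp using shuffle-down.
template <typename T>
__device__ __forceinline__ T WarpReduceSum(T val) {
#if CUDA_VERSION < 9000
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);  // legacy intrinsic, CUDA 8
#else
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffff, val, offset);  // CUDA 9+, full-warp mask
#endif
  return val;
}

// Each thread accumulates a strided partial sum; one atomic per warp writes it out.
__global__ void BlockAccumulate(const float* in, float* out, int n) {
  float partial = 0.f;
  for (int i = threadIdx.x; i < n; i += blockDim.x) partial += in[i];
  partial = WarpReduceSum(partial);
  if ((threadIdx.x & 31) == 0) atomicAdd(out, partial);
}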
Parent 8d16de73
......@@ -301,6 +301,7 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub)
op_library(reduce_mean_op DEPS cub)
else()
op_library(conv_op DEPS vol2col im2col)
endif()
......
......@@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
auto& dev_ctx = context.template device_context<DeviceContext>();
depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
output);
}
};
......@@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
input_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
paddings, input_grad);
paddings, dilations, input_grad);
}
if (filter_grad) {
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
filter_grad);
dilations, filter_grad);
}
}
};
......
......@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
output);
dilations, output);
}
};
......@@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
if (input_grad) {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
input_grad);
}
......@@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
filter_grad);
dilations, filter_grad);
}
}
};
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <vector>
#include <cub/cub.cuh> // NOLINT
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace detail {
template <typename T, size_t ElementCount>
struct Array {
public:
HOSTDEVICE inline Array() {}
HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }
HOSTDEVICE inline const T& operator[](size_t index) const {
return data_[index];
}
HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }
template <typename VectorLikeType>
static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
size_t n = static_cast<size_t>(vec.size());
Array<T, ElementCount> ret;
for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
return ret;
}
private:
T data_[ElementCount];
};
// reduce the last axis of 2d array
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
int BlockDim>
__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init,
int reduce_num) {
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
int idx_x = blockIdx.x * reduce_num;
int idx_y = threadIdx.x;
Ty reduce_var = init;
for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
}
}
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
int BlockDim, int Rank, int ReduceRank>
__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init, int reduce_num,
Array<int, Rank> x_strides,
Array<int, ReduceRank> reduce_dim,
Array<int, ReduceRank> reduce_strides,
Array<int, Rank - ReduceRank> left_dim,
Array<int, Rank - ReduceRank> left_strides) {
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
Array<int, Rank> sub_index;
int left_idx = blockIdx.x;
for (int i = 0; i < Rank - ReduceRank; ++i) {
sub_index[left_dim[i]] = left_idx / left_strides[i];
left_idx %= left_strides[i];
}
int reduce_idx = threadIdx.x;
for (int j = 0; j < ReduceRank; ++j) {
sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
reduce_idx %= reduce_strides[j];
}
int idx_x = 0;
for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));
for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
int reduce_idx = i;
for (int j = 0; j < ReduceRank; ++j) {
sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
reduce_idx %= reduce_strides[j];
}
int idx_x = 0;
for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
}
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
}
}
static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
int n = static_cast<int>(dims.size());
if (n == 0) return std::vector<int>();
std::vector<int> strides(n);
strides.back() = 1;
for (int i = n - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dims[i + 1];
}
return strides;
}
static inline std::vector<int> GetStrides(const std::vector<int>& dims,
const std::vector<int>& idx) {
int n = static_cast<int>(idx.size());
if (n == 0) return std::vector<int>();
std::vector<int> strides(n);
strides.back() = 1;
for (int i = n - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dims[idx[i + 1]];
}
return strides;
}
constexpr int kMaxBlockDim = 512;
static inline int GetDesiredBlockDim(int block_dim) {
return block_dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << static_cast<int>(std::log2(block_dim)));
}
template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
typename TransformOp>
static void TensorReduceImpl(
const Tx* x_data, Ty* y_data, const platform::Place& place,
const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
int left_num, int reduce_num, const std::vector<int>& x_strides,
const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
const std::vector<int>& left_dim, const std::vector<int>& left_strides,
cudaStream_t stream) {
#define CUB_RANK_CASE(i, ...) \
case i: { \
constexpr auto kRank = i; \
switch (reduce_rank) { __VA_ARGS__; } \
} break
#define CUB_REDUCE_RANK_CASE(i, ...) \
case i: { \
constexpr auto kReduceRank = i; \
ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, \
kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
x_data, y_data, reducer, transformer, init, reduce_num, \
Array<int, kRank>::From(x_strides), \
Array<int, kReduceRank>::From(reduce_dim), \
Array<int, kReduceRank>::From(reduce_strides), \
Array<int, kRank - kReduceRank>::From(left_dim), \
Array<int, kRank - kReduceRank>::From(left_strides)); \
} break
int rank = x_strides.size();
int reduce_rank = reduce_strides.size();
if (rank == reduce_rank) {
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer);
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
place);
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
return;
}
if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
BlockDim><<<left_num, BlockDim, 0, stream>>>(
x_data, y_data, reducer, transformer, init, reduce_num);
return;
}
/*
if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
// TODO(liangdun): we can optimize the 3D case in which the 2nd axis is
// reduced. Currently it is handled by the code below, but inefficiently.
return;
}
*/
switch (rank) {
CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););
CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3););
CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););
CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5););
CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
}
#undef CUB_REDUCE_RANK_CASE
#undef CUB_RANK_CASE
}
} // namespace detail
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
std::vector<int> origin_reduce_dims, const Ty& init,
const ReduceOp& reducer, const TransformOp& transformer,
cudaStream_t stream) {
auto x_dim = framework::vectorize2int(x.dims());
std::vector<int> new_x_dim, new_reduce_dims;
int is_reduced = 0;
for (auto e : origin_reduce_dims) {
auto pos = e >= 0 ? e : e + x_dim.size();
is_reduced |= 1 << e;
}
for (int i = 0; i < x_dim.size(); i++) {
if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
new_x_dim.push_back(x_dim[i]);
if ((is_reduced >> i) & 1)
new_reduce_dims.push_back(new_x_dim.size() - 1);
} else {
new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
}
}
x_dim = new_x_dim;
origin_reduce_dims = new_reduce_dims;
int x_rank = static_cast<int>(x_dim.size());
std::set<int> left_set, reduce_set;
for (int i = 0; i < x_rank; ++i) left_set.insert(i);
for (auto e : origin_reduce_dims) {
left_set.erase(e);
reduce_set.insert(e);
}
std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
std::vector<int> left_dim(left_set.begin(), left_set.end());
std::vector<int> x_strides = detail::GetStrides(x_dim);
std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
int left_num = 1;
if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];
std::vector<int> y_dim(left_dim.size());
for (int i = 0; i < left_dim.size(); ++i) {
y_dim[i] = x_dim[left_dim[i]];
}
auto x_data = x.data<Tx>();
auto y_data = y->mutable_data<Ty>(x.place());
if (reduce_num == 1) return;
#define CUB_BLOCK_DIM_CASE(block_dim) \
case block_dim: { \
constexpr auto kBlockDim = block_dim; \
detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>( \
x_data, y_data, x.place(), reducer, transformer, init, left_num, \
reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, \
left_strides, stream); \
} break
switch (detail::GetDesiredBlockDim(reduce_num)) {
CUB_BLOCK_DIM_CASE(512);
CUB_BLOCK_DIM_CASE(256);
CUB_BLOCK_DIM_CASE(128);
CUB_BLOCK_DIM_CASE(64);
CUB_BLOCK_DIM_CASE(32);
CUB_BLOCK_DIM_CASE(16);
CUB_BLOCK_DIM_CASE(8);
CUB_BLOCK_DIM_CASE(4);
CUB_BLOCK_DIM_CASE(2);
}
#undef CUB_BLOCK_DIM_CASE
}
} // namespace operators
} // namespace paddle
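
The TensorReduce helper above takes the input tensor, the axes to reduce, an identity value, a CUB reduction operator, an element-wise transform, and a CUDA stream. A minimal usage sketch under those definitions (SumLastAxis and this IdentityFunctor are illustrative names only; reduce_sum_op.cu below defines its own IdentityFunctor):

// Transform that leaves each element unchanged before the reduction.
template <typename T>
struct IdentityFunctor {
  HOSTDEVICE inline T operator()(const T& x) const { return x; }
};

// Sum axis 1 of an [N, C] float tensor into an [N] tensor.
inline void SumLastAxis(const paddle::framework::Tensor& x,
                        paddle::framework::Tensor* y, cudaStream_t stream) {
  paddle::operators::TensorReduce<float, float, cub::Sum, IdentityFunctor<float>>(
      x, y, /*origin_reduce_dims=*/{1}, /*init=*/0.0f, cub::Sum(),
      IdentityFunctor<float>(), stream);
}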
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/platform/cuda_primitives.h"
......@@ -20,31 +21,54 @@ namespace paddle {
namespace operators {
namespace math {
template <typename T>
__inline__ __device__ T warpReduceSum(T val) {
#if CUDA_VERSION < 9000
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
return val;
#else
#define FULL_MASK 0xffffffff
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(FULL_MASK, val, offset);
return val;
#endif
}
__forceinline__ __device__ unsigned lane_id() {
unsigned ret;
asm volatile("mov.u32 %0, %laneid;" : "=r"(ret));
return ret;
}
__forceinline__ __device__ unsigned warp_id() {
unsigned ret;
asm volatile("mov.u32 %0, %warpid;" : "=r"(ret));
return ret;
}
// A Cuda kernel to compute the depthwise convolution forward pass
// in NCHW format.
template <typename T>
__global__ void KernelDepthwiseConv(
const int nthreads, const T* const input_data, const T* const filter_data,
const int batch_size, const int output_channels, const int output_height,
const int output_width, const int input_channels, const int input_height,
const int input_width, const int filter_multiplier, const int filter_height,
__device__ __inline__ void KernelDepthwiseConv(
const T* const input_data, const T* const filter_data, const int batch_size,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, T* const output_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int batch = index / output_channels / output_height / output_width;
const int c_out = (index / output_height / output_width) % output_channels;
const int h_out = (index / output_width) % output_height;
const int w_out = index % output_width;
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const output_data) {
for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
const int batch = blockIdx.y;
const int c_out = blockIdx.x;
const int c_in = c_out / filter_multiplier;
const T* weight = filter_data + c_out * filter_height * filter_width;
T value = 0;
const int h_in_start = -padding_height + h_out * stride_height;
const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_end = h_in_start + filter_height;
const int w_in_end = w_in_start + filter_width;
const int h_in_end = h_in_start + filter_height * dilate_height;
const int w_in_end = w_in_start + filter_width * dilate_width;
const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width;
......@@ -53,116 +77,212 @@ __global__ void KernelDepthwiseConv(
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
int weight_offset = 0;
for (int h_in = h_start; h_in < h_end; h_in++) {
for (int w_in = w_start; w_in < w_end; w_in++) {
for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) {
for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) {
if (h_in >= h_start && h_in < h_end && w_in >= w_start &&
w_in < w_end) {
const int offset = in_offset + h_in * input_width + w_in;
value +=
weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
input_data[offset];
value += weight[weight_offset] * input_data[offset];
}
weight_offset++;
}
}
int index =
((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
w_out;
output_data[index] = value;
}
}
}
template <typename T, int c_filter_multiplier, int c_stride>
__global__ void KernelDepthwiseConvSp(
const T* const input_data, const T* const filter_data, const int batch_size,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const output_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels,
input_height, input_width, filter_multiplier,
filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width,
dilate_height, dilate_width, output_data);
else
KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels,
input_height, input_width, c_filter_multiplier,
filter_height, filter_width, c_stride, c_stride,
padding_height, padding_width, dilate_height,
dilate_width, output_data);
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <typename T>
__global__ void KernelDepthwiseConvInputGrad(
const int nthreads, const T* const output_grad_data,
const T* const filter_data, const int batch_size, const int output_channels,
const int output_height, const int output_width, const int input_channels,
const int input_height, const int input_width, const int filter_multiplier,
const int filter_height, const int filter_width, const int stride_height,
const int stride_width, const int padding_height, const int padding_width,
T* const input_grad_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int batch = index / input_channels / input_height / input_width;
const int c_in = (index / input_height / input_width) % input_channels;
const int h_in = (index / input_width) % input_height;
const int w_in = index % input_width;
__device__ __inline__ void KernelDepthwiseConvInputGrad(
const T* const output_grad_data, const T* const filter_data,
const int batch_size, const int output_channels, const int output_height,
const int output_width, const int input_channels, const int input_height,
const int input_width, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const input_grad_data) {
for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
const int batch = blockIdx.y;
const int c_in = blockIdx.x;
const int c_out_start = c_in * filter_multiplier;
int h_out_start =
(h_in - filter_height + padding_height + stride_height) / stride_height;
h_out_start = 0 > h_out_start ? 0 : h_out_start;
h_in - (filter_height - 1) * dilate_height + padding_height;
int h_out_end = (h_in + padding_height) / stride_height;
h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
int h_out_end = h_in + padding_height;
int w_out_start =
(w_in - filter_width + padding_width + stride_width) / stride_width;
w_out_start = 0 > w_out_start ? 0 : w_out_start;
w_in - (filter_width - 1) * dilate_width + padding_width;
int w_out_end = (w_in + padding_width) / stride_width;
w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end;
int w_out_end = w_in + padding_width;
T value = 0;
for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
c_out++) {
for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
const int filter_h = h_in + padding_height - h_out * stride_height;
for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
const int filter_w = w_in + padding_width - w_out * stride_width;
const int filter_offset = c_out * filter_height * filter_width +
filter_h * filter_width + filter_w;
int filter_offset = (c_out + 1) * filter_height * filter_width;
for (int h_out = h_out_start; h_out <= h_out_end;
h_out += dilate_height) {
for (int w_out = w_out_start; w_out <= w_out_end;
w_out += dilate_width) {
filter_offset--;
int s_h_out = h_out / stride_height;
int s_w_out = w_out / stride_width;
if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
s_w_out < output_width) {
const int output_grad_offset =
((batch * output_channels + c_out) * output_height + h_out) *
((batch * output_channels + c_out) * output_height +
s_h_out) *
output_width +
w_out;
value +=
output_grad_data[output_grad_offset] * filter_data[filter_offset];
s_w_out;
value += output_grad_data[output_grad_offset] *
filter_data[filter_offset];
}
}
}
}
int index =
((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
w_in;
input_grad_data[index] = value;
}
input_grad_data[index] += value;
}
}
template <typename T, int c_filter_multiplier, int c_stride>
__global__ void KernelDepthwiseConvInputGradSp(
const T* const output_grad_data, const T* const filter_data,
const int batch_size, const int output_channels, const int output_height,
const int output_width, const int input_channels, const int input_height,
const int input_width, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const input_grad_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConvInputGrad<T>(
output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, input_grad_data);
else
KernelDepthwiseConvInputGrad<T>(
output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
padding_height, padding_width, dilate_height, dilate_width,
input_grad_data);
}
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename T>
__global__ void KernelDepthwiseConvFilterGrad(
const int nthreads, const T* const output_grad_data,
const T* const input_data, const int num, const int output_channels,
const int output_height, const int output_width, const int input_channels,
const int input_height, const int input_width, const int filter_multiplier,
const int filter_height, const int filter_width, const int stride_height,
const int stride_width, const int padding_height, const int padding_width,
T* const filter_grad_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int w_out = index % output_width;
const int h_out = (index / output_width) % output_height;
const int c_out = (index / output_width / output_height) % output_channels;
const int batch = (index / output_width / output_height / output_channels);
const int c_in = c_out / filter_multiplier;
const int h_in_start = -padding_height + h_out * stride_height;
const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_end =
-padding_height + h_out * stride_height + filter_height;
const int w_in_end = -padding_width + w_out * stride_width + filter_width;
const int in_offset =
(batch * input_channels + c_in) * input_height * input_width;
T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
const int h_end = h_in_end < input_height ? h_in_end : input_height;
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int h_in = h_start; h_in < h_end; h_in++) {
for (int w_in = w_start; w_in < w_end; w_in++) {
const int offset = in_offset + h_in * input_width + w_in;
const T diff_temp = output_grad_data[index] * input_data[offset];
T* addr = addr_offset + (h_in - h_in_start) * filter_width +
(w_in - w_in_start);
paddle::platform::CudaAtomicAdd(addr, diff_temp);
__device__ __inline__ void KernelDepthwiseConvFilterGrad(
const T* output_grad_data, const T* input_data, const int num,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* filter_grad_data) {
T s = 0;
int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
int lid = lane_id();
for (int image_w = threadIdx.x; image_w < output_width;
image_w += blockDim.x) {
for (int bid = 0; bid < num; bid++) {
for (int image_h = threadIdx.y; image_h < output_height;
image_h += blockDim.y) {
int kernel_id = blockIdx.z;
int kernel_h = blockIdx.y * dilate_height - padding_height;
int kernel_w = blockIdx.x * dilate_width - padding_width;
int image_hk = image_h * stride_height + kernel_h;
int image_wk = image_w * stride_width + kernel_w;
if (image_hk < 0 || image_hk >= input_height) continue;
if (image_wk < 0 || image_wk >= input_width) continue;
#define gaid(N, C, H, W) \
((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
input_data[((bid * (gridDim.z / filter_multiplier) +
kernel_id / filter_multiplier) *
input_height +
image_hk) *
input_width +
image_wk];
#undef gaid
}
}
}
#if __CUDA_ARCH__ >= 530
s = warpReduceSum<T>(s);
if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
#else
paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
#endif
}
template <typename T, int c_filter_multiplier>
__global__ void KernelDepthwiseConvFilterGradSp(
const T* output_grad_data, const T* input_data, const int num,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* filter_grad_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConvFilterGrad<T>(
output_grad_data, input_data, num, output_channels, output_height,
output_width, input_channels, input_height, input_width,
filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, filter_grad_data);
else
KernelDepthwiseConvFilterGrad<T>(
output_grad_data, input_data, num, output_channels, output_height,
output_width, input_channels, input_height, input_width,
c_filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, filter_grad_data);
}
/*
......@@ -177,7 +297,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& input,
const framework::Tensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output) {
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
......@@ -191,22 +313,37 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* input_data = input.data<T>();
const T* filter_data = filter.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
nthreads, input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
output_channels / input_channels, ksize_height, ksize_width,
stride_height, stride_width, padding_height, padding_width,
output_data);
int thread = 512;
int blocks = std::min(std::max(thread / output_width, 1), output_height);
dim3 threads(std::min(output_width, thread), blocks, 1);
dim3 grid(output_channels, batch_size, 1);
int filter_multiplier = output_channels / input_channels;
#define check_case(c_filter_multiplier, c_stride) \
if (c_filter_multiplier == 0 || \
filter_multiplier == c_filter_multiplier && \
stride_height == stride_width && stride_height == c_stride) { \
KernelDepthwiseConvSp<T, c_filter_multiplier, \
c_stride><<<grid, threads, 0, context.stream()>>>( \
input_data, filter_data, batch_size, output_channels, output_height, \
output_width, input_channels, input_height, input_width, \
filter_multiplier, ksize_height, ksize_width, stride_height, \
stride_width, padding_height, padding_width, dilate_height, \
dilate_width, output_data); \
return; \
}
check_case(1, 1);
check_case(1, 2);
// NOTE(liangdun): check_case(0, 0) covers all other cases;
// add more cases if needed, e.g. check_case(2^n, 1).
check_case(0, 0);
#undef check_case
}
};
......@@ -219,6 +356,7 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
......@@ -233,22 +371,39 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* filter_data = filter.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * input_channels * input_height * input_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
output_channels / input_channels, ksize_height, ksize_width,
stride_height, stride_width, padding_height, padding_width,
input_grad_data);
int thread = 512;
int blocks = std::min(std::max(thread / input_width, 1), input_height);
dim3 threads(std::min(input_width, thread), blocks, 1);
dim3 grid(input_channels, batch_size, 1);
int filter_multiplier = output_channels / input_channels;
#define check_case(c_filter_multiplier, c_stride) \
if (c_filter_multiplier == 0 || \
filter_multiplier == c_filter_multiplier && \
stride_height == stride_width && stride_height == c_stride) { \
KernelDepthwiseConvInputGradSp< \
T, c_filter_multiplier, \
c_stride><<<grid, threads, 0, context.stream()>>>( \
output_grad_data, filter_data, batch_size, output_channels, \
output_height, output_width, input_channels, input_height, \
input_width, filter_multiplier, ksize_height, ksize_width, \
stride_height, stride_width, padding_height, padding_width, \
dilate_height, dilate_width, input_grad_data); \
return; \
}
check_case(1, 1);
check_case(1, 2);
// NOTE(liangdun): check_case(0, 0) covers all other cases;
// add more cases if needed, e.g. check_case(2^n, 1).
check_case(0, 0);
#undef check_case
}
};
......@@ -260,6 +415,7 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
......@@ -274,23 +430,34 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* input_data = input.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, output_grad_data, input_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
output_channels / input_channels, ksize_height, ksize_width,
stride_height, stride_width, padding_height, padding_width,
filter_grad_data);
int block_size = 512;
int crop_output_height =
std::min(std::max(block_size / output_width, 1), output_height);
dim3 grid(ksize_width, ksize_height, output_channels);
dim3 threads(std::min(output_width, block_size), crop_output_height, 1);
int filter_multiplier = output_channels / input_channels;
#define check_case(c_filter_multiplier) \
if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \
KernelDepthwiseConvFilterGradSp< \
T, c_filter_multiplier><<<grid, threads, 0, context.stream()>>>( \
output_grad_data, input_data, batch_size, output_channels, \
output_height, output_width, input_channels, input_height, \
input_width, filter_multiplier, ksize_height, ksize_width, \
stride_height, stride_width, padding_height, padding_width, \
dilate_height, dilate_width, filter_grad_data); \
return; \
}
check_case(1);
check_case(0);
#undef check_case
}
};
......
......@@ -32,7 +32,8 @@ class DepthwiseConvFunctor {
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output);
const std::vector<int>& paddings,
const std::vector<int>& dilations, framework::Tensor* output);
};
template <typename DeviceContext, typename T>
......@@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor {
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad);
};
......@@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor {
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad);
};
......
......@@ -12,17 +12,64 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/fluid/operators/cub_reduce.h"
#include "paddle/fluid/operators/reduce_mean_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_mean,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MeanFunctor>);
namespace paddle {
namespace operators {
template <typename T>
struct DivideFunctor {
HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
private:
T n_inv;
};
template <typename T>
class ReduceMeanKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
std::vector<int> reduce_dims;
if (reduce_all) {
reduce_dims.resize(input->dims().size());
for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
} else {
for (auto e : dims) {
reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
}
}
int reduce_num = 1;
for (int i = 0; i < reduce_dims.size(); ++i) {
reduce_num *= input->dims()[reduce_dims[i]];
}
auto stream = context.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
DivideFunctor<T>(reduce_num), stream);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
ops::ReduceMeanKernel<double>,
ops::ReduceMeanKernel<int>,
ops::ReduceMeanKernel<int64_t>);
REGISTER_OP_CUDA_KERNEL(
reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanGradFunctor>,
......
......@@ -12,17 +12,59 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/cub_reduce.h"
#include "paddle/fluid/operators/reduce_sum_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_sum,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::SumFunctor>);
namespace paddle {
namespace operators {
template <typename T>
struct IdentityFunctor {
HOSTDEVICE explicit inline IdentityFunctor() {}
HOSTDEVICE inline T operator()(const T& x) const { return x; }
};
template <typename T>
class ReduceSumKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
std::vector<int> reduce_dims;
if (reduce_all) {
reduce_dims.resize(input->dims().size());
for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
} else {
for (auto e : dims) {
reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
}
}
int reduce_num = 1;
for (int i = 0; i < reduce_dims.size(); ++i) {
reduce_num *= input->dims()[reduce_dims[i]];
}
auto stream = context.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
IdentityFunctor<T>(), stream);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
ops::ReduceSumKernel<int64_t>);
REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::SumGradFunctor>,
......
......@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest):
def setUp(self):
self.op_type = "conv2d"
self.use_cudnn = False
self.use_cuda = False
self.use_mkldnn = False
self.data_format = "AnyLayout"
self.dtype = np.float32
......@@ -101,24 +102,25 @@ class TestConv2dOp(OpTest):
}
self.outputs = {'Output': output}
def testcudnn(self):
return core.is_compiled_with_cuda() and self.use_cudnn
def testcuda(self):
return core.is_compiled_with_cuda() and (self.use_cudnn or
self.use_cuda)
def test_check_output(self):
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_output_with_place(place, atol=1e-5)
def test_check_grad(self):
if self.dtype == np.float16:
return
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_grad_with_place(
place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
def test_check_grad_no_filter(self):
if self.dtype == np.float16:
return
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_grad_with_place(
place, ['Input'],
'Output',
......@@ -128,7 +130,7 @@ class TestConv2dOp(OpTest):
def test_check_grad_no_input(self):
if self.dtype == np.float16:
return
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_grad_with_place(
place, ['Filter'],
'Output',
......@@ -325,18 +327,33 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
class TestDepthwiseConv(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.filter_size = [3, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConv2(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [3, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConv3(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
......@@ -347,6 +364,34 @@ class TestDepthwiseConv2(TestConv2dOp):
self.op_type = "depthwise_conv2d"
class TestDepthwiseConvWithDilation(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
self.dilations = [2, 2]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConvWithDilation2(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
self.dilations = [2, 2]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
# Please don't remove the following code.
# Currently, CI uses cuDNN v5.0, which does not support dilated convolution.
# class TestCUDNNWithDilation(TestWithDilation):
......