From effdae160415040bea351db3f111a7320d41d871 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Mon, 8 Oct 2018 10:28:08 +0800 Subject: [PATCH] Revert "Optimization of Kernels that related to DeepLabv3+ (#13534)" --- paddle/fluid/operators/CMakeLists.txt | 1 - paddle/fluid/operators/conv_op.h | 7 +- paddle/fluid/operators/conv_transpose_op.h | 7 +- paddle/fluid/operators/cub_reduce.h | 322 ------------ paddle/fluid/operators/math/depthwise_conv.cu | 479 ++++++------------ paddle/fluid/operators/math/depthwise_conv.h | 5 +- paddle/fluid/operators/reduce_mean_op.cu | 65 +-- paddle/fluid/operators/reduce_sum_op.cu | 60 +-- .../fluid/tests/unittests/test_conv2d_op.py | 59 +-- 9 files changed, 188 insertions(+), 817 deletions(-) delete mode 100644 paddle/fluid/operators/cub_reduce.h diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index b61bca8c3d..fa41266d62 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -301,7 +301,6 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) - op_library(reduce_mean_op DEPS cub) else() op_library(conv_op DEPS vol2col im2col) endif() diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index ef76106f17..b3140116df 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -380,8 +380,7 @@ class DepthwiseConvKernel : public framework::OpKernel { math::DepthwiseConvFunctor depthwiseConv; auto& dev_ctx = context.template device_context(); - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); } }; @@ -416,14 +415,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel { input_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, input_grad, static_cast(0)); depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad); + paddings, input_grad); } if (filter_grad) { filter_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, filter_grad, static_cast(0)); depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, - dilations, filter_grad); + filter_grad); } } }; diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 88c578b141..0d9c6a62fe 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, - dilations, output); + output); } }; @@ -367,11 +367,10 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); if (input_grad) { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations, + depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, input_grad); } @@ -383,7 +382,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, 
strides, paddings, - dilations, filter_grad); + filter_grad); } } }; diff --git a/paddle/fluid/operators/cub_reduce.h b/paddle/fluid/operators/cub_reduce.h deleted file mode 100644 index 16fdad775f..0000000000 --- a/paddle/fluid/operators/cub_reduce.h +++ /dev/null @@ -1,322 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include // NOLINT -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -namespace detail { -template -struct Array { - public: - HOSTDEVICE inline Array() {} - - HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; } - - HOSTDEVICE inline const T& operator[](size_t index) const { - return data_[index]; - } - - HOSTDEVICE constexpr inline size_t size() const { return ElementCount; } - - template - static inline Array From(const VectorLikeType& vec) { - PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match"); - size_t n = static_cast(vec.size()); - Array ret; - for (size_t i = 0; i < n; ++i) ret[i] = vec[i]; - return ret; - } - - private: - T data_[ElementCount]; -}; - -// reduce the last axis of 2d array -template -__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, - int reduce_num) { - __shared__ typename cub::BlockReduce::TempStorage temp_storage; - int idx_x = blockIdx.x * reduce_num; - int idx_y = threadIdx.x; - Ty reduce_var = init; - for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) - reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y])); - - reduce_var = - cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); - - if (threadIdx.x == 0) { - y[blockIdx.x] = reduce_var; - } -} - -template -__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, int reduce_num, - Array x_strides, - Array reduce_dim, - Array reduce_strides, - Array left_dim, - Array left_strides) { - __shared__ typename cub::BlockReduce::TempStorage temp_storage; - Array sub_index; - int left_idx = blockIdx.x; - for (int i = 0; i < Rank - ReduceRank; ++i) { - sub_index[left_dim[i]] = left_idx / left_strides[i]; - left_idx %= left_strides[i]; - } - - int reduce_idx = threadIdx.x; - for (int j = 0; j < ReduceRank; ++j) { - sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; - reduce_idx %= reduce_strides[j]; - } - - int idx_x = 0; - for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); - Ty reduce_var = static_cast(transformer(x[idx_x])); - - for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { - int reduce_idx = i; - for (int j = 0; j < ReduceRank; ++j) { - sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; - reduce_idx %= reduce_strides[j]; - } - - int idx_x = 0; - for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); - reduce_var = static_cast(reducer(reduce_var, transformer(x[idx_x]))); - } - - 
reduce_var = - cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); - - if (threadIdx.x == 0) { - y[blockIdx.x] = reduce_var; - } -} - -static inline std::vector GetStrides(const std::vector& dims) { - int n = static_cast(dims.size()); - if (n == 0) return std::vector(); - std::vector strides(n); - strides.back() = 1; - for (int i = n - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * dims[i + 1]; - } - return strides; -} - -static inline std::vector GetStrides(const std::vector& dims, - const std::vector& idx) { - int n = static_cast(idx.size()); - if (n == 0) return std::vector(); - std::vector strides(n); - strides.back() = 1; - for (int i = n - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * dims[idx[i + 1]]; - } - return strides; -} - -constexpr int kMaxBlockDim = 512; - -static inline int GetDesiredBlockDim(int block_dim) { - return block_dim >= kMaxBlockDim - ? kMaxBlockDim - : (1 << static_cast(std::log2(block_dim))); -} - -template -static void TensorReduceImpl( - const Tx* x_data, Ty* y_data, const platform::Place& place, - const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, - int left_num, int reduce_num, const std::vector& x_strides, - const std::vector& reduce_dim, const std::vector& reduce_strides, - const std::vector& left_dim, const std::vector& left_strides, - cudaStream_t stream) { -#define CUB_RANK_CASE(i, ...) \ - case i: { \ - constexpr auto kRank = i; \ - switch (reduce_rank) { __VA_ARGS__; } \ - } break - -#define CUB_REDUCE_RANK_CASE(i, ...) \ - case i: { \ - constexpr auto kReduceRank = i; \ - ReduceKernel<<>>( \ - x_data, y_data, reducer, transformer, init, reduce_num, \ - Array::From(x_strides), \ - Array::From(reduce_dim), \ - Array::From(reduce_strides), \ - Array::From(left_dim), \ - Array::From(left_strides)); \ - } break - - int rank = x_strides.size(); - int reduce_rank = reduce_strides.size(); - if (rank == reduce_rank) { - cub::TransformInputIterator trans_x( - x_data, transformer); - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, - reduce_num, reducer, init, stream); - framework::Tensor tmp; - auto* temp_storage = tmp.mutable_data( - framework::make_ddim({static_cast(temp_storage_bytes)}), - place); - cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, - reduce_num, reducer, init, stream); - return; - } - if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { - ReduceKernel2D<<>>( - x_data, y_data, reducer, transformer, init, reduce_num); - return; - } - /* - if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) { - // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced. 
- // Currently, it is handled by code below, but inefficient - return; - } - */ - - switch (rank) { - CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1);); - - CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);); - - CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3);); - - CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);); - - CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); - CUB_REDUCE_RANK_CASE(5);); - - CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); - CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);); - - CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); - CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);); - - CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); - CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6); - CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8);); - } - -#undef CUB_REDUCE_RANK_CASE -#undef CUB_RANK_CASE -} - -} // namespace detail - -template -void TensorReduce(const framework::Tensor& x, framework::Tensor* y, - std::vector origin_reduce_dims, const Ty& init, - const ReduceOp& reducer, const TransformOp& transformer, - cudaStream_t stream) { - auto x_dim = framework::vectorize2int(x.dims()); - std::vector new_x_dim, new_reduce_dims; - int is_reduced = 0; - for (auto e : origin_reduce_dims) { - auto pos = e >= 0 ? e : e + x_dim.size(); - is_reduced |= 1 << e; - } - for (int i = 0; i < x_dim.size(); i++) { - if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { - new_x_dim.push_back(x_dim[i]); - if ((is_reduced >> i) & 1) - new_reduce_dims.push_back(new_x_dim.size() - 1); - } else { - new_x_dim[new_x_dim.size() - 1] *= x_dim[i]; - } - } - x_dim = new_x_dim; - origin_reduce_dims = new_reduce_dims; - int x_rank = static_cast(x_dim.size()); - std::set left_set, reduce_set; - for (int i = 0; i < x_rank; ++i) left_set.insert(i); - - for (auto e : origin_reduce_dims) { - left_set.erase(e); - reduce_set.insert(e); - } - - std::vector reduce_dim(reduce_set.begin(), reduce_set.end()); - std::vector left_dim(left_set.begin(), left_set.end()); - - std::vector x_strides = detail::GetStrides(x_dim); - std::vector reduce_strides = detail::GetStrides(x_dim, reduce_dim); - std::vector left_strides = detail::GetStrides(x_dim, left_dim); - int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; - int left_num = 1; - if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]]; - - std::vector y_dim(left_dim.size()); - for (int i = 0; i < left_dim.size(); ++i) { - y_dim[i] = x_dim[left_dim[i]]; - } - auto x_data = x.data(); - auto y_data = y->mutable_data(x.place()); - if (reduce_num == 1) return; - -#define CUB_BLOCK_DIM_CASE(block_dim) \ - case block_dim: { \ - constexpr auto kBlockDim = block_dim; \ - detail::TensorReduceImpl( \ - x_data, y_data, x.place(), reducer, transformer, init, left_num, \ - reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, \ - left_strides, stream); \ - } break - - switch (detail::GetDesiredBlockDim(reduce_num)) { - CUB_BLOCK_DIM_CASE(512); - CUB_BLOCK_DIM_CASE(256); - CUB_BLOCK_DIM_CASE(128); - CUB_BLOCK_DIM_CASE(64); - CUB_BLOCK_DIM_CASE(32); - CUB_BLOCK_DIM_CASE(16); - CUB_BLOCK_DIM_CASE(8); - CUB_BLOCK_DIM_CASE(4); - 
CUB_BLOCK_DIM_CASE(2); - } -#undef CUB_BLOCK_DIM_CASE -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 3be3899123..027e2de48d 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -21,268 +20,149 @@ namespace paddle { namespace operators { namespace math { -template -__inline__ __device__ T warpReduceSum(T val) { -#if CUDA_VERSION < 9000 - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down(val, offset); - return val; -#else -#define FULL_MASK 0xffffffff - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down_sync(FULL_MASK, val, offset); - return val; -#endif -} -__forceinline__ __device__ unsigned lane_id() { - unsigned ret; - asm volatile("mov.u32 %0, %laneid;" : "=r"(ret)); - return ret; -} - -__forceinline__ __device__ unsigned warp_id() { - unsigned ret; - asm volatile("mov.u32 %0, %warpid;" : "=r"(ret)); - return ret; -} - // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. template -__device__ __inline__ void KernelDepthwiseConv( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, +__global__ void KernelDepthwiseConv( + const int nthreads, const T* const input_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { - for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { - for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { - const int batch = blockIdx.y; - const int c_out = blockIdx.x; - - const int c_in = c_out / filter_multiplier; - const T* weight = filter_data + c_out * filter_height * filter_width; - T value = 0; - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; - const int h_in_end = h_in_start + filter_height * dilate_height; - const int w_in_end = w_in_start + filter_width * dilate_width; - - const int in_offset = - ((batch * input_channels + c_in) * input_height) * input_width; - - const int h_end = h_in_end < input_height ? h_in_end : input_height; - const int w_end = w_in_end < input_width ? w_in_end : input_width; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int w_start = w_in_start > 0 ? 
w_in_start : 0; - int weight_offset = 0; - - for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) { - for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) { - if (h_in >= h_start && h_in < h_end && w_in >= w_start && - w_in < w_end) { - const int offset = in_offset + h_in * input_width + w_in; - value += weight[weight_offset] * input_data[offset]; - } - weight_offset++; - } + const int padding_height, const int padding_width, T* const output_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if (index < nthreads) { + const int batch = index / output_channels / output_height / output_width; + const int c_out = (index / output_height / output_width) % output_channels; + const int h_out = (index / output_width) % output_height; + const int w_out = index % output_width; + + const int c_in = c_out / filter_multiplier; + const T* weight = filter_data + c_out * filter_height * filter_width; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + filter_height; + const int w_in_end = w_in_start + filter_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + value += + weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * + input_data[offset]; } - int index = - ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + - w_out; - output_data[index] = value; } + output_data[index] = value; } } -template -__global__ void KernelDepthwiseConvSp( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { - if (c_filter_multiplier == 0) - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, filter_multiplier, - filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, - dilate_height, dilate_width, output_data); - - else - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, - filter_height, filter_height, c_stride, c_stride, - padding_height, padding_width, dilate_height, - dilate_width, output_data); -} - // CUDA kernel to compute the depthwise convolution backprop w.r.t input. 
template -__device__ __inline__ void KernelDepthwiseConvInputGrad( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { - for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { - for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { - const int batch = blockIdx.y; - const int c_in = blockIdx.x; - - const int c_out_start = c_in * filter_multiplier; - - int h_out_start = - h_in - (filter_height - 1) * dilate_height + padding_height; - - int h_out_end = h_in + padding_height; - - int w_out_start = - w_in - (filter_width - 1) * dilate_width + padding_width; - - int w_out_end = w_in + padding_width; - - T value = 0; - - for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; - c_out++) { - int filter_offset = (c_out + 1) * filter_height * filter_width; - for (int h_out = h_out_start; h_out <= h_out_end; - h_out += dilate_height) { - for (int w_out = w_out_start; w_out <= w_out_end; - w_out += dilate_width) { - filter_offset--; - int s_h_out = h_out / stride_height; - int s_w_out = w_out / stride_width; - if (h_out % stride_height == 0 && w_out % stride_width == 0 && - s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && - s_w_out < output_width) { - const int output_grad_offset = - ((batch * output_channels + c_out) * output_height + - s_h_out) * - output_width + - s_w_out; - value += output_grad_data[output_grad_offset] * - filter_data[filter_offset]; - } - } +__global__ void KernelDepthwiseConvInputGrad( + const int nthreads, const T* const output_grad_data, + const T* const filter_data, const int batch_size, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const input_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int batch = index / input_channels / input_height / input_width; + const int c_in = (index / input_height / input_width) % input_channels; + const int h_in = (index / input_width) % input_height; + const int w_in = index % input_width; + + const int c_out_start = c_in * filter_multiplier; + + int h_out_start = + (h_in - filter_height + padding_height + stride_height) / stride_height; + h_out_start = 0 > h_out_start ? 0 : h_out_start; + + int h_out_end = (h_in + padding_height) / stride_height; + h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end; + + int w_out_start = + (w_in - filter_width + padding_width + stride_width) / stride_width; + w_out_start = 0 > w_out_start ? 0 : w_out_start; + + int w_out_end = (w_in + padding_width) / stride_width; + w_out_end = output_width - 1 < w_out_end ? 
output_width - 1 : w_out_end; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + padding_height - h_out * stride_height; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + padding_width - w_out * stride_width; + const int filter_offset = c_out * filter_height * filter_width + + filter_h * filter_width + filter_w; + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + h_out) * + output_width + + w_out; + value += + output_grad_data[output_grad_offset] * filter_data[filter_offset]; } } - int index = - ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + - w_in; - input_grad_data[index] = value; } + input_grad_data[index] += value; } } -template -__global__ void KernelDepthwiseConvInputGradSp( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { - if (c_filter_multiplier == 0) - KernelDepthwiseConvInputGrad( - output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, input_grad_data); - else - KernelDepthwiseConvInputGrad( - output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, - padding_height, padding_width, dilate_height, dilate_width, - input_grad_data); -} - // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. 
template -__device__ __inline__ void KernelDepthwiseConvFilterGrad( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { - T s = 0; - - int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; - int lid = lane_id(); - - for (int image_w = threadIdx.x; image_w < output_width; - image_w += blockDim.x) { - for (int bid = 0; bid < num; bid++) { - for (int image_h = threadIdx.y; image_h < output_height; - image_h += blockDim.y) { - int kernel_id = blockIdx.z; - int kernel_h = blockIdx.y * dilate_height - padding_height; - int kernel_w = blockIdx.x * dilate_width - padding_width; - - int image_hk = image_h * stride_height + kernel_h; - int image_wk = image_w * stride_width + kernel_w; - if (image_hk < 0 || image_hk >= input_height) continue; - if (image_wk < 0 || image_wk >= input_width) continue; -#define gaid(N, C, H, W) \ - ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W)) - - s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - input_data[((bid * (gridDim.z / filter_multiplier) + - kernel_id / filter_multiplier) * - input_height + - image_hk) * - input_width + - image_wk]; - -#undef gaid +__global__ void KernelDepthwiseConvFilterGrad( + const int nthreads, const T* const output_grad_data, + const T* const input_data, const int num, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const filter_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int w_out = index % output_width; + const int h_out = (index / output_width) % output_height; + const int c_out = (index / output_width / output_height) % output_channels; + const int batch = (index / output_width / output_height / output_channels); + const int c_in = c_out / filter_multiplier; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = + -padding_height + h_out * stride_height + filter_height; + const int w_in_end = -padding_width + w_out * stride_width + filter_width; + const int in_offset = + (batch * input_channels + c_in) * input_height * input_width; + + T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? 
w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + const T diff_temp = output_grad_data[index] * input_data[offset]; + T* addr = addr_offset + (h_in - h_in_start) * filter_width + + (w_in - w_in_start); + paddle::platform::CudaAtomicAdd(addr, diff_temp); } } } -#if __CUDA_ARCH__ >= 530 - s = warpReduceSum(s); - if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); -#else - paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); -#endif -} - -template -__global__ void KernelDepthwiseConvFilterGradSp( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { - if (c_filter_multiplier == 0) - KernelDepthwiseConvFilterGrad( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, filter_grad_data); - else - KernelDepthwiseConvFilterGrad( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - c_filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, filter_grad_data); } /* @@ -297,9 +177,7 @@ class DepthwiseConvFunctor { const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* output) { + const std::vector& paddings, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -313,37 +191,22 @@ class DepthwiseConvFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; - const int dilate_height = dilations[0]; - const int dilate_width = dilations[1]; const T* input_data = input.data(); const T* filter_data = filter.data(); T* output_data = output->mutable_data(context.GetPlace()); - int thread = 512; - int blocks = std::min(std::max(thread / output_width, 1), output_height); - dim3 threads(std::min(output_width, thread), blocks, 1); - dim3 grid(output_channels, batch_size, 1); - int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride) \ - if (c_filter_multiplier == 0 || \ - filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ - KernelDepthwiseConvSp<<>>( \ - input_data, filter_data, batch_size, output_channels, output_height, \ - output_width, input_channels, input_height, input_width, \ - filter_multiplier, ksize_height, ksize_width, stride_height, \ - stride_width, padding_height, padding_width, dilate_height, \ - dilate_width, output_data); \ - return; \ - } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, 
e.g. check_case(2^n,1) - check_case(0, 0); -#undef check_case + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConv<<>>( + nthreads, input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + output_data); } }; @@ -356,7 +219,6 @@ class DepthwiseConvInputGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -371,39 +233,22 @@ class DepthwiseConvInputGradFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; - const int dilate_height = dilations[0]; - const int dilate_width = dilations[1]; const T* filter_data = filter.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int thread = 512; - int blocks = std::min(std::max(thread / input_width, 1), input_height); - dim3 threads(std::min(input_width, thread), blocks, 1); - dim3 grid(input_channels, batch_size, 1); - int filter_multiplier = output_channels / input_channels; - -#define check_case(c_filter_multiplier, c_stride) \ - if (c_filter_multiplier == 0 || \ - filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ - KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, \ - c_stride><<>>( \ - output_grad_data, filter_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, input_grad_data); \ - return; \ - } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, e.g. 
check_case(2^n,1) - check_case(0, 0); -#undef check_case + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvInputGrad<<>>( + nthreads, output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + input_grad_data); } }; @@ -415,7 +260,6 @@ class DepthwiseConvFilterGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, framework::Tensor* filter_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -430,34 +274,23 @@ class DepthwiseConvFilterGradFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; - const int dilate_height = dilations[0]; - const int dilate_width = dilations[1]; const T* input_data = input.data(); const T* output_grad_data = output_grad.data(); T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); - int block_size = 512; - int crop_output_height = - std::min(std::max(block_size / output_width, 1), output_height); - dim3 grid(ksize_width, ksize_height, output_channels); - dim3 threads(std::min(output_width, block_size), crop_output_height, 1); - int filter_multiplier = output_channels / input_channels; - -#define check_case(c_filter_multiplier) \ - if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \ - KernelDepthwiseConvFilterGradSp< \ - T, c_filter_multiplier><<>>( \ - output_grad_data, input_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, filter_grad_data); \ - return; \ - } - check_case(1); - check_case(0); -#undef check_case + int nthreads = batch_size * output_channels * output_height * output_width; + + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvFilterGrad<<>>( + nthreads, output_grad_data, input_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + filter_grad_data); } }; diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h index 71f6fcb23d..97aec40188 100644 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ b/paddle/fluid/operators/math/depthwise_conv.h @@ -32,8 +32,7 @@ class DepthwiseConvFunctor { void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, framework::Tensor* output); + const std::vector& paddings, framework::Tensor* output); }; template @@ -44,7 +43,6 @@ class DepthwiseConvInputGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, framework::Tensor* input_grad); }; @@ -55,7 +53,6 @@ class DepthwiseConvFilterGradFunctor { const 
framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, framework::Tensor* filter_grad); }; diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu index 59b3024483..960cb3235b 100644 --- a/paddle/fluid/operators/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -12,64 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_mean_op.h" -namespace paddle { -namespace operators { - -template -struct DivideFunctor { - HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - - HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } - - private: - T n_inv; -}; - -template -class ReduceMeanKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } - - auto stream = context.cuda_device_context().stream(); - TensorReduce>( - *input, output, reduce_dims, static_cast(0), cub::Sum(), - DivideFunctor(reduce_num), stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, - ops::ReduceMeanKernel, - ops::ReduceMeanKernel, - ops::ReduceMeanKernel); - +REGISTER_OP_CUDA_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); REGISTER_OP_CUDA_KERNEL( reduce_mean_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu index 53cd9e9419..f2e16955a5 100644 --- a/paddle/fluid/operators/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -12,59 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_sum_op.h" -namespace paddle { -namespace operators { - -template -struct IdentityFunctor { - HOSTDEVICE explicit inline IdentityFunctor() {} - - HOSTDEVICE inline T operator()(const T& x) const { return x; } -}; - -template -class ReduceSumKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } - - auto stream = context.cuda_device_context().stream(); - TensorReduce>( - *input, output, reduce_dims, static_cast(0), cub::Sum(), - IdentityFunctor(), stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, - ops::ReduceSumKernel, ops::ReduceSumKernel, - ops::ReduceSumKernel); - +REGISTER_OP_CUDA_KERNEL(reduce_sum, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, ops::ReduceGradKernel, diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 2ecc2504a8..6a2732e939 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -67,7 +67,6 @@ class TestConv2dOp(OpTest): def setUp(self): self.op_type = "conv2d" self.use_cudnn = False - self.use_cuda = False self.use_mkldnn = False self.data_format = "AnyLayout" self.dtype = np.float32 @@ -102,25 +101,24 @@ class TestConv2dOp(OpTest): } self.outputs = {'Output': output} - def testcuda(self): - return core.is_compiled_with_cuda() and (self.use_cudnn or - self.use_cuda) + def testcudnn(self): + return core.is_compiled_with_cuda() and self.use_cudnn def test_check_output(self): - place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() self.check_output_with_place(place, atol=1e-5) def test_check_grad(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() self.check_grad_with_place( place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02) def test_check_grad_no_filter(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() self.check_grad_with_place( place, ['Input'], 'Output', @@ -130,7 +128,7 @@ class TestConv2dOp(OpTest): def test_check_grad_no_input(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() self.check_grad_with_place( place, ['Filter'], 'Output', @@ -327,65 +325,22 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): class TestDepthwiseConv(TestConv2dOp): def init_test_case(self): - self.use_cuda = True self.pad = [1, 1] self.stride = [2, 2] self.input_size = [2, 3, 5, 5] # NCHW self.groups = 3 assert np.mod(self.input_size[1], self.groups) == 0 f_c = self.input_size[1] // self.groups - self.filter_size = [3, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - -class TestDepthwiseConv2(TestConv2dOp): - def init_test_case(self): - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [3, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - -class TestDepthwiseConv3(TestConv2dOp): - def init_test_case(self): - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - 
        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
         self.op_type = "depthwise_conv2d"


-class TestDepthwiseConvWithDilation(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        self.dilations = [2, 2]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConvWithDilation2(TestConv2dOp):
+class TestDepthwiseConv2(TestConv2dOp):
     def init_test_case(self):
-        self.use_cuda = True
         self.pad = [1, 1]
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         self.groups = 3
-        self.dilations = [2, 2]
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
-- 
GitLab
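
Reference note: the cub_reduce.h header deleted by this revert built its generic TensorReduce on cub::BlockReduce, launching one thread block per output element, letting each thread stride over the reduced elements, and having thread 0 write the block-wide result. A minimal, self-contained sketch of that building block for a 2-D row reduction follows; the kernel name and launch shape are illustrative, not the deleted file's actual API.

// CUDA C++ sketch of the block-reduce pattern, assuming CUB is available.
#include <cub/cub.cuh>

// One block reduces reduce_num consecutive inputs into one output element.
template <typename T, int BlockDim>
__global__ void RowSumKernel(const T* x, T* y, int reduce_num) {
  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  const T* row = x + blockIdx.x * reduce_num;  // this block's input row
  T partial = 0;
  for (int i = threadIdx.x; i < reduce_num; i += BlockDim)
    partial += row[i];                         // thread-local partial sum
  T total = BlockReduce(temp_storage).Reduce(partial, cub::Sum());
  if (threadIdx.x == 0) y[blockIdx.x] = total; // one output per block
}

// Example launch (illustrative):
//   RowSumKernel<float, 256><<<num_rows, 256, 0, stream>>>(x, y, reduce_num);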
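
Likewise, the removed KernelDepthwiseConvFilterGrad accumulated each thread's partial filter gradient with warpReduceSum, so only one atomic add per warp reached global memory instead of one per thread. A minimal sketch of that warp-shuffle pattern, assuming CUDA 9+ for __shfl_down_sync; the kernel and buffer names are illustrative, not the operator's actual symbols.

#include <cuda_runtime.h>

// Sum val across the 32 lanes of a warp; lane 0 ends up holding the total.
__inline__ __device__ float WarpReduceSum(float val) {
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffff, val, offset);
  return val;
}

// Each thread contributes one partial value; one atomicAdd per warp.
__global__ void AccumulatePartials(const float* partials, int n, float* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  float v = (i < n) ? partials[i] : 0.f;
  v = WarpReduceSum(v);
  if ((threadIdx.x & 31) == 0) atomicAdd(out, v);
}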