From a06bec128743d7b036636e53eaefbfe989f0d319 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?=
Date: Mon, 27 Nov 2017 11:55:56 +0800
Subject: [PATCH] Conv cudnn 3d (#5783)

* conv cudnn 3d

* update test case

* update

* update

* follow comments and remove groups from helper

* update

* refine

* update

* follow comments2

* update

* fix compile
---
 paddle/operators/CMakeLists.txt              |   7 +
 paddle/operators/conv_cudnn_op.cc            |  41 +++++-
 paddle/operators/conv_cudnn_op.cu.cc         | 129 +++++++++++++-----
 paddle/platform/cudnn_helper.h               |   5 +-
 .../paddle/v2/fluid/tests/test_conv2d_op.py  |  14 +-
 .../paddle/v2/fluid/tests/test_conv3d_op.py  |  26 ++++
 6 files changed, 170 insertions(+), 52 deletions(-)

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 7ab09b6c65..05d4ea2606 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -73,6 +73,13 @@ function(op_library TARGET)
     file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
   endif()
 
+  # conv_cudnn_op contains several operators
+  if ("${TARGET}" STREQUAL "conv_cudnn_op")
+    set(pybind_flag 1)
+    # It is enough to add just one operator to pybind
+    file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n")
+  endif()
+
   # pool_op contains several operators
   if ("${TARGET}" STREQUAL "pool_op")
     set(pybind_flag 1)
diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc
index c03dc3e4fb..0dd8c13b2a 100644
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -17,10 +17,10 @@
 namespace paddle {
 namespace operators {
 
-class CudnnConvOpMaker : public Conv2DOpMaker {
+class CudnnConv2DOpMaker : public Conv2DOpMaker {
  public:
-  CudnnConvOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  CudnnConv2DOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
       : Conv2DOpMaker(proto, op_checker) {
     AddAttr<int>("workspace_size_MB",
                  "workspace size for cudnn, in MB, "
@@ -32,16 +32,43 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
   }
 };
 
+class CudnnConv3DOpMaker : public Conv3DOpMaker {
+ public:
+  CudnnConv3DOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
+      : Conv3DOpMaker(proto, op_checker) {
+    AddAttr<int>("workspace_size_MB",
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardware. This size should be chosen carefully.")
+        .SetDefault(4096);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
-            ops::ConvOpGrad);
+REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker,
+            conv2d_cudnn_grad, ops::ConvOpGrad);
+
+REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker,
+            conv3d_cudnn_grad, ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv2d_cudnn,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_cudnn_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
 
-REGISTER_OP_CPU_KERNEL(conv_cudnn,
+REGISTER_OP_CPU_KERNEL(conv3d_cudnn,
                        ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
                        ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv_cudnn_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    conv3d_cudnn_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
     ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
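Note: both operators registered above reuse ConvOp, so output spatial sizes follow the usual dilated-convolution rule that the Python tests below also compute (out_w = 1 + (in_w + 2 * pad - (dilation * (f_w - 1) + 1)) / stride), applied per axis: H and W for conv2d_cudnn, and D, H, W for conv3d_cudnn. A minimal Python sketch of that rule; the helper name conv_out_size is ours for illustration, not anything in this patch:

    def conv_out_size(in_size, k, pad, stride, dilation=1):
        # Effective (dilated) kernel extent along one spatial axis.
        dilated = dilation * (k - 1) + 1
        return 1 + (in_size + 2 * pad - dilated) // stride

    # Per-axis, so it covers conv2d (H, W) and conv3d (D, H, W) alike:
    assert conv_out_size(5, 3, pad=0, stride=1) == 3
    assert conv_out_size(5, 3, pad=1, stride=2) == 3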
diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc
index 5eaf6b3370..a9763d4248 100644
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -56,6 +56,21 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
     ScopedFilterDescriptor filter_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 0)
+    // cudnn 7 can support groups, no need to set it manually
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
@@ -63,19 +78,34 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
         layout, framework::vectorize2int(output->dims()), groups);
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
 
     int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
-    int output_channels = output->dims()[1];
-    int output_height = output->dims()[2];
-    int output_width = output->dims()[3];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+    int output_channels = filter->dims()[0];
+    int output_height, output_width, output_depth;
+    if (output->dims().size() == 5) {
+      output_depth = output->dims()[2];
+      output_height = output->dims()[3];
+      output_width = output->dims()[4];
+    } else {
+      output_depth = 1;
+      output_height = output->dims()[2];
+      output_width = output->dims()[3];
+    }
 
-    int group_offset_in = input_channels / groups * input_height * input_width;
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
     int group_offset_out =
-        output_channels / groups * output_height * output_width;
+        output_channels / groups * output_height * output_width * output_depth;
     int group_offset_filter = filter->numel() / groups;
     // ------------------- cudnn conv workspace ---------------------
     void* cudnn_workspace = nullptr;
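Note: the group offsets computed above are flat element strides into contiguous NCHW/NCDHW storage, with a 4-D input treated as depth 1. A small NumPy sketch of the same arithmetic, under illustrative names and sizes of our choosing:

    import numpy as np

    n, c, d, h, w, groups = 2, 6, 1, 4, 4, 3
    x = np.arange(n * c * d * h * w, dtype=np.float32).reshape(n, c, d, h, w)

    # Mirrors group_offset_in: input_channels / groups * depth * height * width.
    group_offset_in = c // groups * d * h * w

    flat = x.reshape(n, -1)
    for i in range(groups):
        # The flat slice starting at i * group_offset_in is exactly
        # group i's block of c / groups channels.
        got = flat[:, i * group_offset_in:(i + 1) * group_offset_in]
        want = x[:, i * (c // groups):(i + 1) * (c // groups)].reshape(n, -1)
        assert np.array_equal(got, want)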
@@ -138,12 +168,26 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_grad_desc;
-    ScopedTensorDescriptor input_grad_desc;
     ScopedFilterDescriptor filter_desc;
     ScopedFilterDescriptor filter_grad_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 0)
+    // cudnn 7 can support groups, no need to set it manually
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
@@ -152,22 +196,35 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
         layout, framework::vectorize2int(output_grad->dims()), groups);
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr;
-    cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr;
-
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
 
     int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+
     int output_grad_channels = filter->dims()[0];
-    int output_grad_height = output_grad->dims()[2];
-    int output_grad_width = output_grad->dims()[3];
+    int output_grad_height, output_grad_width, output_grad_depth;
+    if (input->dims().size() == 5) {
+      output_grad_depth = output_grad->dims()[2];
+      output_grad_height = output_grad->dims()[3];
+      output_grad_width = output_grad->dims()[4];
+    } else {
+      output_grad_depth = 1;
+      output_grad_height = output_grad->dims()[2];
+      output_grad_width = output_grad->dims()[3];
+    }
 
-    int group_offset_in = input_channels / groups * input_height * input_width;
-    int group_offset_out =
-        output_grad_channels / groups * output_grad_height * output_grad_width;
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
+    int group_offset_out = output_grad_channels / groups * output_grad_height *
+                           output_grad_width * output_grad_depth;
     int group_offset_filter = filter->numel() / groups;
     // ------------------- cudnn backward algorithm ---------------------
     cudnnConvolutionBwdDataAlgo_t data_algo;
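Note: the backward kernel repeats the forward kernel's 4-D/5-D unpacking, which amounts to normalizing every shape to NCDHW with depth 1 in the 2D case (the 4-or-5 constraint itself lives in InferShape, per the inline comment). A throwaway Python equivalent, with normalize_dims as an illustrative name of our own:

    def normalize_dims(dims):
        # Returns (n, c, depth, height, width); 4-D NCHW gets depth == 1.
        if len(dims) == 5:  # NCDHW
            n, c, depth, height, width = dims
        else:               # NCHW
            n, c, height, width = dims
            depth = 1
        return n, c, depth, height, width

    assert normalize_dims((8, 3, 32, 32)) == (8, 3, 1, 32, 32)
    assert normalize_dims((8, 3, 16, 32, 32)) == (8, 3, 16, 32, 32)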
@@ -180,8 +237,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     auto handle = ctx.cuda_device_context().cudnn_handle();
 
     if (input_grad) {
-      cudnn_input_grad_desc = input_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(input_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
               handle, cudnn_filter_desc,
@@ -190,19 +245,17 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
               cudnn_output_grad_desc, cudnn_conv_desc,
               // dxDesc: Handle to the previously initialized output tensor
               // descriptor.
-              cudnn_input_grad_desc,
+              cudnn_input_desc,
               CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
               workspace_size_limit, &data_algo));
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
-              cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size));
+              cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
     }
 
     if (filter_grad) {
-      cudnn_filter_grad_desc = filter_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(filter_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
@@ -222,7 +275,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
-    // FIXME(typhoonzero): template type T may not be the same as cudnn call.
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
@@ -233,21 +285,20 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
             handle, &alpha, cudnn_filter_desc,
             filter_data + i * group_offset_filter, cudnn_output_grad_desc,
             output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_input_grad_desc, input_grad_data + i * group_offset_in));
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + i * group_offset_in));
       }
     }
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
-
       for (int i = 0; i < groups; i++) {
         PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
             handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
             cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
             cudnn_conv_desc, filter_algo, cudnn_workspace,
-            workspace_size_in_bytes, &beta, cudnn_filter_grad_desc,
+            workspace_size_in_bytes, &beta, cudnn_filter_desc,
             filter_grad_data + i * group_offset_filter));
       }
     }
@@ -259,8 +310,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel<float>,
+REGISTER_OP_GPU_KERNEL(conv2d_cudnn,
+                       paddle::operators::CudnnConvOpKernel<float>,
+                       paddle::operators::CudnnConvOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad,
+                       paddle::operators::CudnnConvGradOpKernel<float>,
+                       paddle::operators::CudnnConvGradOpKernel<double>);
+
+REGISTER_OP_GPU_KERNEL(conv3d_cudnn,
+                       paddle::operators::CudnnConvOpKernel<float>,
                        paddle::operators::CudnnConvOpKernel<double>);
-REGISTER_OP_GPU_KERNEL(conv_cudnn_grad,
+REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad,
                        paddle::operators::CudnnConvGradOpKernel<float>,
                        paddle::operators::CudnnConvGradOpKernel<double>);
diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h
index c5d8a6066e..80a4c9bb4b 100644
--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/platform/cudnn_helper.h
@@ -116,7 +116,7 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat(
     case DataLayout::kNCHW:
       return CUDNN_TENSOR_NCHW;
     case DataLayout::kNCDHW:
-      return CUDNN_TENSOR_NCHW;  // TODO(chengduoZH) : add CUDNN_TENSOR_NCDHW
+      return CUDNN_TENSOR_NCHW;  // NOTE: cudnn treats Nd tensors the same
     default:
       PADDLE_THROW("Unknown cudnn equivalent for order");
   }
@@ -143,7 +143,7 @@ class ScopedTensorDescriptor {
       strides[i] = dims[i + 1] * strides[i + 1];
     }
     // Update tensor descriptor dims setting if groups > 1
-    // FIXME(typhoonzero): Assume using NCHW or NCDHW order
+    // NOTE: Assume using NCHW or NCDHW order
     std::vector<int> dims_with_group(dims.begin(), dims.end());  // copy
     if (groups > 1) {
       dims_with_group[1] = dims_with_group[1] / groups;
@@ -186,7 +186,6 @@ class ScopedFilterDescriptor {
     // width of the filter.
     std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
     if (groups > 1) {
-      // M /= groups
       kernel_with_group[0] /= groups;
       // NOTE: input filter(C) of the filter is already asserted to be C/groups.
     }
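Note: when groups > 1 (and grouping is being emulated for cuDNN < 7), the scoped descriptors above shrink the channel axes so each cuDNN call sees one group's worth of data: axis 1 of a tensor and axis 0 (output channels) of a filter are divided by groups, while the filter's input-channel axis is already stored as C / groups. A Python sketch of that bookkeeping, using illustrative names:

    def dims_with_group(dims, groups):
        # Tensor descriptor dims: split the channel axis (axis 1).
        dims = list(dims)
        if groups > 1:
            dims[1] //= groups
        return dims

    def kernel_with_group(kernel, groups):
        # Filter descriptor dims: split output channels (axis 0); the
        # input-channel axis is already C / groups in the stored filter.
        kernel = list(kernel)
        if groups > 1:
            kernel[0] //= groups
        return kernel

    assert dims_with_group([8, 6, 4, 4], groups=3) == [8, 2, 4, 4]
    assert kernel_with_group([12, 2, 3, 3], groups=3) == [4, 2, 3, 3]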
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
index 2240dc73cd..e82e3ab0c9 100644
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -16,8 +16,8 @@ def conv2d_forward_naive(input, filter, group, conv_param):
     out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) / stride[1]
     out = np.zeros((in_n, out_c, out_h, out_w))
 
-    d_bolck_w = (dilation[0] * (f_h - 1) + 1)
-    d_bolck_h = (dilation[1] * (f_w - 1) + 1)
+    d_bolck_h = (dilation[0] * (f_h - 1) + 1)
+    d_bolck_w = (dilation[1] * (f_w - 1) + 1)
 
     input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )),
                        mode='constant',
@@ -167,27 +167,27 @@ class TestWithDilation(TestConv2dOp):
 
 #----------------Conv2dCudnn----------------
 class TestCudnn(TestConv2dOp):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 class TestCudnnWithPad(TestWithPad):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 class TestCudnnWithStride(TestWithStride):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 class TestCudnnWithGroup(TestWithGroup):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 class TestCudnnWith1x1(TestWith1x1):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 # cudnn v5 does not support dilation conv.
diff --git a/python/paddle/v2/fluid/tests/test_conv3d_op.py b/python/paddle/v2/fluid/tests/test_conv3d_op.py
index 934ea46437..8593dff20b 100644
--- a/python/paddle/v2/fluid/tests/test_conv3d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_op.py
@@ -169,5 +169,31 @@ class TestWithDilation(TestConv3dOp):
         self.groups = 3
 
 
+class TestCudnn(TestConv3dOp):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+class TestWithGroup1Cudnn(TestWithGroup1):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+class TestWithGroup2Cudnn(TestWithGroup2):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+class TestWith1x1Cudnn(TestWith1x1):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+# FIXME(typhoonzero): find a way to determine if
+# using cudnn > 6 in python
+# class TestWithDilationCudnn(TestWithDilation):
+#     def init_op_type(self):
+#         self.op_type = "conv3d_cudnn"
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab
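Note: the new cudnn test classes rely on inheritance: each one reuses a parent configuration wholesale and overrides only init_op_type, so the cudnn kernels run through exactly the same numeric checks as the plain operators. A self-contained sketch of that pattern; this harness is illustrative only, not the real OpTest machinery:

    import unittest

    class TestConv3dOp(unittest.TestCase):
        def setUp(self):
            self.init_op_type()

        def init_op_type(self):
            self.op_type = "conv3d"

        def test_op_type_is_set(self):
            self.assertTrue(self.op_type.startswith("conv3d"))

    # The cudnn variant swaps only the op type; every check is inherited.
    class TestCudnn(TestConv3dOp):
        def init_op_type(self):
            self.op_type = "conv3d_cudnn"

    if __name__ == '__main__':
        unittest.main()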