From ed2a1852486197e6c8df1a9bdc1bbe47bec40df6 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Sun, 24 Nov 2019 09:44:19 +0800
Subject: [PATCH] optimize nhwc for tensor core in ConvOp and ConvGradOp
 (#20597)

---
 paddle/fluid/operators/conv_cudnn_op.cu          | 255 +++++++++++++-----
 paddle/fluid/operators/conv_op.cc                |  14 +-
 paddle/fluid/operators/conv_op.h                 |  30 +++
 paddle/fluid/platform/cudnn_desc.h               |  48 +++-
 .../fluid/tests/unittests/test_conv2d_op.py      |  57 +++-
 5 files changed, 329 insertions(+), 75 deletions(-)

diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu
index c86c9a868f5..f6399ee0d3e 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
@@ -40,6 +40,10 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
 
+static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) {
+  return dev_ctx.GetComputeCapability() >= 70;
+}
+
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
@@ -68,11 +72,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     const std::string data_format = ctx.Attr<std::string>("data_format");
     const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
 
+    auto dtype = platform::CudnnDataType<T>::type;
+
+    // Tensor Cores, introduced with Volta GPUs, support much faster conv ops
+    // with FP16 in the NHWC data format.
+    const bool compute_in_nhwc =
+        dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
+    // We only do the data format conversion from NHWC to NCHW here;
+    // cuDNN converts NCHW to NHWC automatically on Tensor Cores.
+    auto compute_format =
+        compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
+    VLOG(3) << "Compute ConvOp with cuDNN:"
+            << " data_format=" << data_format << " compute_format="
+            << (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
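
The layout decision added above reduces to a small pure rule: compute in NHWC only for FP16 data on an SM 7.0+ GPU when the tensors are already channel-last. A minimal standalone sketch of that rule (the helper name and driver below are illustrative, not part of the patch):

```cpp
#include <iostream>

enum class DataLayout { kNCHW, kNHWC };

// Hypothetical restatement of the rule in the kernel above: compute in NHWC
// only when the data is FP16, the GPU is Volta (SM 7.0) or newer, and the
// tensors are already channel-last.
DataLayout ChooseComputeFormat(bool is_fp16, int compute_capability,
                               bool channel_last) {
  const bool compute_in_nhwc = is_fp16 && compute_capability >= 70;
  return (compute_in_nhwc && channel_last) ? DataLayout::kNHWC
                                           : DataLayout::kNCHW;
}

int main() {
  // FP16 on a V100 (SM 7.0) with NHWC input -> stay in NHWC for Tensor Cores.
  std::cout << (ChooseComputeFormat(true, 70, true) == DataLayout::kNHWC)
            << "\n";  // 1
  // FP32, or pre-Volta, or NCHW input -> fall back to NCHW.
  std::cout << (ChooseComputeFormat(false, 70, true) == DataLayout::kNHWC)
            << "\n";  // 0
}
```
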
"NHWC" : "NCHW"); + // ------------ transformed tensor ----------- Tensor transformed_input_channel(input->type()); Tensor transformed_output(output->type()); + Tensor transformed_filter_channel(filter->type()); T* output_data = nullptr; - if (channel_last) { + if (channel_last && compute_format == DataLayout::kNCHW) { + VLOG(3) << "Transform input tensor from NHWC to NCHW."; ResizeToChannelFirst( ctx, input, &transformed_input_channel); TransToChannelFirst( @@ -82,19 +102,36 @@ class CUDNNConvOpKernel : public framework::OpKernel { &transformed_output); } else { - transformed_input_channel = *input; - transformed_output = *output; + transformed_input_channel.ShareDataWith(*input); + transformed_output.ShareDataWith(*output); + } + if (compute_format == DataLayout::kNHWC) { + VLOG(3) << "Transform filter tensor from NCHW to NHWC."; + ResizeToChannelLast( + ctx, filter, &transformed_filter_channel); + TransToChannelLast( + ctx, filter, &transformed_filter_channel); + } else { + transformed_filter_channel.ShareDataWith(*filter); } output_data = transformed_output.data(); // update padding and dilation auto in_dims = transformed_input_channel.dims(); - auto filter_dims = filter->dims(); + auto filter_dims = transformed_filter_channel.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + framework::DDim filter_data_dims; + + if (compute_format == DataLayout::kNCHW) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + filter_data_dims = + framework::slice_ddim(filter_dims, 2, filter_dims.size()); + } else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + filter_data_dims = + framework::slice_ddim(filter_dims, 1, filter_dims.size() - 1); + } - framework::DDim filter_data_dims = - framework::slice_ddim(filter_dims, 2, filter_dims.size()); std::vector ksize = framework::vectorize(filter_data_dims); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); @@ -108,17 +145,33 @@ class CUDNNConvOpKernel : public framework::OpKernel { std::vector padding_diff(data_dim); std::vector new_input_shape_vec(data_dim + 2); new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; + + if (compute_format == DataLayout::kNCHW) { + new_input_shape_vec[1] = transformed_input_channel.dims()[1]; + } else { + new_input_shape_vec[data_dim + 1] = + transformed_input_channel.dims()[data_dim + 1]; + } std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); for (size_t i = 0; i < data_dim; ++i) { padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + if (compute_format == DataLayout::kNCHW) { + new_input_shape_vec[i + 2] = + transformed_input_channel.dims()[i + 2] + padding_diff[i]; + } else { + new_input_shape_vec[i + 1] = + transformed_input_channel.dims()[i + 1] + padding_diff[i]; + } + if (compute_format == DataLayout::kNCHW) { + input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + } else { + input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; + 
      framework::DDim new_input_shape(
          framework::make_ddim(new_input_shape_vec));
@@ -147,7 +200,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
       }
     } else {
-      transformed_input = transformed_input_channel;
+      transformed_input.ShareDataWith(transformed_input_channel);
       if (paddings.size() == data_dim) {
         for (size_t i = 0; i < data_dim; ++i) {
           padding_common[i] = paddings[i];
@@ -160,18 +213,20 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     }
 
     const T* input_data = transformed_input.data<T>();
-    const T* filter_data = filter->data<T>();
+    const T* filter_data = transformed_filter_channel.data<T>();
 
     // ------------------- cudnn descriptors ---------------------
-    ConvArgs args{&transformed_input, filter, &transformed_output, strides,
-                  padding_common, dilations};
+    ConvArgs args{&transformed_input,  &transformed_filter_channel,
+                  &transformed_output, strides,
+                  padding_common,      dilations};
 
     auto handle = dev_ctx.cudnn_handle();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    auto dtype = platform::CudnnDataType<T>::type;
-    DataLayout layout = DataLayout::kNCHW;
-    if (transformed_input_channel.dims().size() == 5) {
-      layout = DataLayout::kNCDHW;
+    DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
+                                                            : DataLayout::kNCHW;
+    if (transformed_input.dims().size() == 5) {
+      layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
+                                                   : DataLayout::kNCDHW;
     }
     auto layout_format = GetCudnnTensorFormat(layout);
@@ -186,21 +241,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           args.cdesc.desc(), groups));
       groups = 1;
 #endif
-    args.idesc.set(transformed_input, groups);
-
-    args.wdesc.set(*filter, layout_format, groups);
-    args.odesc.set(transformed_output, groups);
+    args.idesc.set(transformed_input, layout_format);
+    args.wdesc.set(transformed_filter_channel, layout_format, groups);
+    args.odesc.set(transformed_output, layout_format);
     int i_n, i_c, i_d, i_h, i_w;
-
-    GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
-             &i_h, &i_w);
     int o_n, o_c, o_d, o_h, o_w;
-    GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
-             &o_h, &o_w);
+
+    if (compute_format == DataLayout::kNHWC) {
+      GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
+               &i_h, &i_w);
+      GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d,
+               &o_h, &o_w);
+    } else {
+      GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
+               &i_h, &i_w);
+      GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
+               &o_h, &o_w);
+    }
 
     int group_offset_in = i_c / groups * i_h * i_w * i_d;
     int group_offset_out = o_c / groups * o_h * o_w * o_d;
-    int group_offset_filter = filter->numel() / groups;
+    int group_offset_filter = transformed_filter_channel.numel() / groups;
     // ------------------- cudnn conv workspace ---------------------
     size_t workspace_size = 0;  // final workspace to allocate.
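
GetNCDHW reads the N, C, D, H, W extents out of a dim vector according to the layout, and the group offsets then address each group's contiguous slice of the tensor. A hypothetical 4-D analogue of that extraction (the helper below is illustrative, with depth fixed at 1 as in the 2-D case):

```cpp
#include <iostream>
#include <vector>

enum class DataLayout { kNCHW, kNHWC };

// Illustrative 4-D analogue of GetNCDHW: pick N, C, H, W out of a dim
// vector depending on whether the channel sits at axis 1 or last.
void GetNCHW(const std::vector<int>& dims, DataLayout layout, int* n, int* c,
             int* h, int* w) {
  *n = dims[0];
  if (layout == DataLayout::kNCHW) {
    *c = dims[1]; *h = dims[2]; *w = dims[3];
  } else {  // kNHWC
    *h = dims[1]; *w = dims[2]; *c = dims[3];
  }
}

int main() {
  int n, c, h, w;
  GetNCHW({8, 16, 32, 32}, DataLayout::kNCHW, &n, &c, &h, &w);
  int groups = 4;
  // Each group's input slice spans C/groups * H * W elements, matching
  // group_offset_in above (with i_d = 1 in the 4-D case).
  std::cout << n << " " << c / groups * h * w << "\n";  // 8 4096
}
```
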
     // ------------------- cudnn conv algorithm ---------------------
@@ -225,7 +286,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
                       workspace_size);
     }
 
-    if (channel_last) {
+    if (channel_last && compute_format == DataLayout::kNCHW) {
       TransToChannelLast<platform::CUDADeviceContext, T>(
           ctx, &transformed_output, output);
     }
@@ -245,7 +306,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
 
-    const T* filter_data = filter->data<T>();
     if (input_grad) {
       input_grad->mutable_data<T>(ctx.GetPlace());
     }
@@ -269,12 +329,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     const std::string data_format = ctx.Attr<std::string>("data_format");
     const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
 
+    auto dtype = platform::CudnnDataType<T>::type;
+    const bool compute_in_nhwc =
+        dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
+    auto compute_format =
+        compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
+    VLOG(3) << "Compute ConvGradOp with cuDNN:"
+            << " data_format=" << data_format << " compute_format="
+            << (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
+
     // transform Tensor
     Tensor transformed_input_channel(input->type());
     Tensor transformed_output_grad_channel(output_grad->type());
     Tensor transformed_input_grad_channel(input->type());
+    Tensor transformed_filter_channel(filter->type());
+    Tensor transformed_filter_grad_channel(filter->type());
 
-    if (channel_last) {
+    if (channel_last && compute_format == DataLayout::kNCHW) {
+      VLOG(3) << "Transform input, output_grad and input_grad tensors from "
+                 "NHWC to NCHW.";
       ResizeToChannelFirst<platform::CUDADeviceContext, T>(
           ctx, input, &transformed_input_channel);
       TransToChannelFirst<platform::CUDADeviceContext, T>(
@@ -289,22 +362,46 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
         ResizeToChannelFirst<platform::CUDADeviceContext, T>(
             ctx, input_grad, &transformed_input_grad_channel);
       }
-
     } else {
-      transformed_input_channel = *input;
-      transformed_output_grad_channel = *output_grad;
+      transformed_input_channel.ShareDataWith(*input);
+      transformed_output_grad_channel.ShareDataWith(*output_grad);
       if (input_grad) {
         transformed_input_grad_channel.ShareDataWith(*input_grad);
       }
     }
 
+    if (compute_format == DataLayout::kNHWC) {
+      VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC.";
+      ResizeToChannelLast<platform::CUDADeviceContext, T>(
+          ctx, filter, &transformed_filter_channel);
+      TransToChannelLast<platform::CUDADeviceContext, T>(
+          ctx, filter, &transformed_filter_channel);
+
+      if (filter_grad) {
+        ResizeToChannelLast<platform::CUDADeviceContext, T>(
+            ctx, filter_grad, &transformed_filter_grad_channel);
+      }
+    } else {
+      transformed_filter_channel.ShareDataWith(*filter);
+      if (filter_grad) {
+        transformed_filter_grad_channel.ShareDataWith(*filter_grad);
+      }
+    }
+
     //  update paddings
     auto in_dims = transformed_input_channel.dims();
-    auto filter_dims = filter->dims();
+    auto filter_dims = transformed_filter_channel.dims();
     framework::DDim in_data_dims;
-    in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
-    framework::DDim filter_data_dims =
-        framework::slice_ddim(filter_dims, 2, filter_dims.size());
+    framework::DDim filter_data_dims;
+    if (compute_format == DataLayout::kNCHW) {
+      in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
+      filter_data_dims =
+          framework::slice_ddim(filter_dims, 2, filter_dims.size());
+    } else {
+      in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1);
+      filter_data_dims =
+          framework::slice_ddim(filter_dims, 1, filter_dims.size() - 1);
+    }
     std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
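
Both kernels branch the same way when slicing out the spatial extents: in NCHW they are dims[2:], in NHWC they are dims[1:rank-1], excluding the trailing channel. A small sketch of that slicing, with plain vectors standing in for framework::DDim:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for framework::slice_ddim on a dim vector.
std::vector<int64_t> SliceDims(const std::vector<int64_t>& dims, size_t begin,
                               size_t end) {
  return {dims.begin() + begin, dims.begin() + end};
}

int main() {
  std::vector<int64_t> nchw = {32, 3, 224, 224};  // N, C, H, W
  std::vector<int64_t> nhwc = {32, 224, 224, 3};  // N, H, W, C

  // NCHW: spatial dims are everything from axis 2 onward.
  auto s1 = SliceDims(nchw, 2, nchw.size());      // {224, 224}
  // NHWC: spatial dims are axes 1 .. rank-2, excluding the channel.
  auto s2 = SliceDims(nhwc, 1, nhwc.size() - 1);  // {224, 224}

  std::cout << s1[0] << "x" << s1[1] << " " << s2[0] << "x" << s2[1] << "\n";
}
```
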
     UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
                              in_data_dims, strides, ksize);
@@ -323,15 +420,30 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       std::vector<int> padding_diff(data_dim);
       std::vector<int> new_input_shape_vec(data_dim + 2);
       new_input_shape_vec[0] = transformed_input_channel.dims()[0];
-      new_input_shape_vec[1] = transformed_input_channel.dims()[1];
+      if (compute_format == DataLayout::kNCHW) {
+        new_input_shape_vec[1] = transformed_input_channel.dims()[1];
+      } else {
+        new_input_shape_vec[data_dim + 1] =
+            transformed_input_channel.dims()[data_dim + 1];
+      }
 
       for (size_t i = 0; i < data_dim; ++i) {
         padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
         padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
-        new_input_shape_vec[i + 2] =
-            transformed_input_channel.dims()[i + 2] + padding_diff[i];
-        input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
-        input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
+        if (compute_format == DataLayout::kNCHW) {
+          new_input_shape_vec[i + 2] =
+              transformed_input_channel.dims()[i + 2] + padding_diff[i];
+        } else {
+          new_input_shape_vec[i + 1] =
+              transformed_input_channel.dims()[i + 1] + padding_diff[i];
+        }
+        if (compute_format == DataLayout::kNCHW) {
+          input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
+          input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
+        } else {
+          input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
+          input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
+        }
       }
       framework::DDim new_input_shape(
           framework::make_ddim(new_input_shape_vec));
@@ -384,42 +496,51 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 
     const T* input_data = transformed_input.data<T>();
     const T* output_grad_data = transformed_output_grad_channel.data<T>();
+    const T* filter_data = transformed_filter_channel.data<T>();
     T* filter_grad_data = nullptr;
     T* input_grad_data = nullptr;
     T* transformed_input_grad_data = nullptr;
 
     ConvArgs args1{&transformed_input_grad,
-                   filter,
+                   &transformed_filter_channel,
                    &transformed_output_grad_channel,
                    strides,
                    padding_common,
                    dilations};
     ConvArgs args2{&transformed_input,
-                   filter_grad,
+                   &transformed_filter_grad_channel,
                    &transformed_output_grad_channel,
                    strides,
                    padding_common,
                    dilations};
 
     auto handle = dev_ctx.cudnn_handle();
-    auto dtype = platform::CudnnDataType<T>::type;
-    DataLayout layout = DataLayout::kNCHW;
-    if (input->dims().size() == 5) {
-      layout = DataLayout::kNCDHW;
+    DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
+                                                            : DataLayout::kNCHW;
+    if (transformed_input.dims().size() == 5) {
+      layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
+                                                   : DataLayout::kNCDHW;
     }
     auto layout_tensor = GetCudnnTensorFormat(layout);
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
 
     int i_n, i_c, i_d, i_h, i_w;
-    GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
-             &i_h, &i_w);
     int o_n, o_c, o_d, o_h, o_w;
-    GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n,
-             &o_c, &o_d, &o_h, &o_w);
+    if (compute_format == DataLayout::kNHWC) {
+      GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
+               &i_h, &i_w);
+      GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC, &o_n,
+               &o_c, &o_d, &o_h, &o_w);
+    } else {
+      GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
+               &i_h, &i_w);
+      GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n,
+               &o_c, &o_d, &o_h, &o_w);
+    }
 
     int group_offset_in = i_c / groups * i_h * i_w * i_d;
     int group_offset_out = o_c / groups * o_h * o_w * o_d;
-    int group_offset_filter = filter->numel() / groups;
+    int group_offset_filter = transformed_filter_channel.numel() / groups;
     // ------------------- cudnn backward algorithm ---------------------
     cudnnConvolutionBwdDataAlgo_t data_algo =
         static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
@@ -439,9 +560,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       input_grad_data = input_grad->data<T>();
       transformed_input_grad_data = transformed_input_grad.data<T>();
       args1.handle = handle;
-      args1.idesc.set(transformed_input_grad, iwo_groups);
-      args1.wdesc.set(*filter, layout_tensor, iwo_groups);
-      args1.odesc.set(transformed_output_grad_channel, iwo_groups);
+      args1.idesc.set(transformed_input_grad, layout_tensor);
+      args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
+      args1.odesc.set(transformed_output_grad_channel, layout_tensor);
       args1.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
 
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
@@ -453,11 +574,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 
     if (filter_grad) {
       // ------------------- cudnn descriptors ---------------------
-      filter_grad_data = filter_grad->data<T>();
+      filter_grad_data = transformed_filter_grad_channel.data<T>();
       args2.handle = handle;
-      args2.idesc.set(transformed_input, iwo_groups);
-      args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups);
-      args2.odesc.set(transformed_output_grad_channel, iwo_groups);
+      args2.idesc.set(transformed_input, layout_tensor);
+      args2.wdesc.set(transformed_filter_grad_channel, layout_tensor,
+                      iwo_groups);
+      args2.odesc.set(transformed_output_grad_channel, layout_tensor);
       args2.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
 
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
@@ -506,7 +628,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       }
     }
 
-    if (channel_last) {
+    if (channel_last && compute_format == DataLayout::kNCHW) {
       TransToChannelLast<platform::CUDADeviceContext, T>(
           ctx, &transformed_input_grad_channel, input_grad);
     }
@@ -527,6 +649,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
           },
           workspace_size);
     }
+
+    if (compute_format == DataLayout::kNHWC) {
+      TransToChannelFirst<platform::CUDADeviceContext, T>(
+          ctx, &transformed_filter_grad_channel, filter_grad);
+    }
   }
 };
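
The conv_op.cc hunk below switches ConvOutputSize to index filter_data_dims, which is already sliced down to the spatial dims, instead of filter_dims[i + 2], which is only correct for channel-first filters. For reference, the output-extent formula itself, mirroring the naive Python reference near the end of this patch (a sketch, not the framework's exact helper):

```cpp
#include <iostream>

// Standard convolution output-extent formula with dilation and asymmetric
// padding; matches the conv2d_forward_naive computation later in this patch.
int ConvOutputSize(int input_size, int filter_size, int dilation, int pad_0,
                   int pad_1, int stride) {
  int dkernel = dilation * (filter_size - 1) + 1;
  return (input_size + pad_0 + pad_1 - dkernel) / stride + 1;
}

int main() {
  // 224-wide input, 3-wide filter, stride 2, padding 1 on both sides.
  std::cout << ConvOutputSize(224, 3, 1, 1, 1, 2) << "\n";  // 112
}
```
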
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 8a85ad83851..0a4ae0866b1 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -97,13 +97,15 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
                     filter_dims[0], filter_dims, groups);
 
   framework::DDim in_data_dims;
+  framework::DDim filter_data_dims;
   if (channel_last) {
     in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1);
   } else {
     in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
   }
-  framework::DDim filter_data_dims =
-      framework::slice_ddim(filter_dims, 2, filter_dims.size());
+
+  filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size());
+
   std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
   UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
                            in_data_dims, strides, ksize);
@@ -117,9 +119,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
         (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) {
       output_shape.push_back(-1);
     } else {
-      output_shape.push_back(ConvOutputSize(in_data_dims[i], filter_dims[i + 2],
-                                            dilations[i], paddings[2 * i],
-                                            paddings[2 * i + 1], strides[i]));
+      output_shape.push_back(
+          ConvOutputSize(in_data_dims[i], filter_data_dims[i], dilations[i],
+                         paddings[2 * i], paddings[2 * i + 1], strides[i]));
     }
   }
   if (channel_last) {
@@ -335,7 +337,7 @@ parameters is checked in the infer-shape.
 Input(Input) and Output(Output) are in NCHW or NHWC format. Where N is batch
 size, C is the number of channels, H is the height of the feature, and W is
 the width of the feature.
-Filters(Input) is MCHW format format. Where M is the number of output image channels, C is
+Filters(Input) is MCHW format. Where M is the number of output image channels, C is
 the number of input image channels, H is the height of the filter, and W
 is the width of the filter.
 Parameters(strides, paddings, dilations) are two elements. These two elements represent
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index bb6a805147c..1f765b66283 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -154,6 +154,36 @@ inline void ResizeToChannelFirst(const framework::ExecutionContext& context,
   }
 }
 
+template <typename DeviceContext, typename T>
+inline void ResizeToChannelLast(const framework::ExecutionContext& context,
+                                const Tensor* input,
+                                Tensor* transformed_input) {
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = framework::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[3];
+    in_dims_vec[3] = input->dims()[4];
+    in_dims_vec[4] = input->dims()[1];
+    transformed_input->Resize(framework::make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+
+  } else if (dim == 2) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = framework::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[3];
+    in_dims_vec[3] = input->dims()[1];
+    transformed_input->Resize(framework::make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+  }
+}
+
 template <typename DeviceContext, typename T>
 inline void TransToChannelFirst(const framework::ExecutionContext& context,
                                 const Tensor* input,
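
The ResizeToChannelLast helper added above only permutes shape metadata: the channel moves from axis 1 to the last axis and the spatial axes shift down by one, for both 4-D (NCHW) and 5-D (NCDHW) tensors. The same rotation in miniature, with plain vectors instead of Tensor:

```cpp
#include <iostream>
#include <vector>

// Rotate a channel-first shape (N, C, spatial...) into channel-last
// (N, spatial..., C); works for both 4-D and 5-D shapes.
std::vector<int> ToChannelLastDims(const std::vector<int>& dims) {
  std::vector<int> out(dims.size());
  out[0] = dims[0];                // batch stays put
  for (size_t i = 2; i < dims.size(); ++i) {
    out[i - 1] = dims[i];          // spatial axes shift left by one
  }
  out[dims.size() - 1] = dims[1];  // channel goes last
  return out;
}

int main() {
  for (int d : ToChannelLastDims({8, 3, 32, 32})) std::cout << d << " ";
  std::cout << "\n";  // 8 32 32 3
  for (int d : ToChannelLastDims({8, 3, 16, 32, 32})) std::cout << d << " ";
  std::cout << "\n";  // 8 16 32 32 3
}
```
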
W; + } else { + D = dims[1]; + H = dims[2]; + W = dims[3]; + C = dims[4]; + transformed_dims[1] = C; + transformed_dims[2] = D; + transformed_dims[3] = H; + transformed_dims[4] = W; + } + return transformed_dims; +} + template <> inline cudnnDataType_t ToCudnnDataType( const framework::proto::VarType::Type& t) { @@ -117,6 +140,19 @@ class TensorDescriptor { dims_with_group.data(), strides.data())); } + void set(const Tensor& tensor, const cudnnTensorFormat_t format) { + auto dims = framework::vectorize(tensor.dims()); + std::vector transformed_dims; + if (format == CUDNN_TENSOR_NHWC) { + transformed_dims = TransformDimOrder(dims); + } else { + transformed_dims = dims; + } + CUDNN_ENFORCE(dynload::cudnnSetTensorNdDescriptorEx( + desc_.get(), format, ToCudnnDataType(tensor.type()), + transformed_dims.size(), transformed_dims.data())); + } + private: std::unique_ptr desc_; }; @@ -143,12 +179,18 @@ class FilterDescriptor { void set(const Tensor& tensor, const cudnnTensorFormat_t format, const int groups = 1) { auto dims = framework::vectorize(tensor.dims()); + std::vector transformed_dims; + if (format == CUDNN_TENSOR_NHWC) { + transformed_dims = TransformDimOrder(dims); + } else { + transformed_dims = dims; + } if (groups > 1) { - dims[1] = dims[1] / groups; + transformed_dims[1] = transformed_dims[1] / groups; } CUDNN_ENFORCE(dynload::cudnnSetFilterNdDescriptor( - desc_.get(), ToCudnnDataType(tensor.type()), format, dims.size(), - dims.data())); + desc_.get(), ToCudnnDataType(tensor.type()), format, + transformed_dims.size(), transformed_dims.data())); } private: diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index a1a3cc9ba7a..c936016a80b 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -81,7 +81,6 @@ def conv2d_forward_naive(input, if len(pad) == 4: pad_h_0, pad_h_1 = pad[0], pad[1] pad_w_0, pad_w_1 = pad[2], pad[3] - out_h = 1 + (in_h + pad_h_0 + pad_h_1 - (dilation[0] * (f_h - 1) + 1)) // stride[0] out_w = 1 + (in_w + pad_w_0 + pad_w_1 - (dilation[1] * @@ -204,6 +203,50 @@ def create_test_cudnn_channel_last_class(parent): globals()[cls_name] = TestCudnnChannelLastCase +def create_test_cudnn_channel_last_fp16_class(parent, grad_check=True): + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestCudnnChannelLastFp16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter'])) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) + + def init_data_format(self): + self.data_format = "NHWC" + + def init_test_case_2(self): + N, C, H, W = self.input_size + self.input_size = [N, H, W, C] + + cls_name = "{0}_{1}".format(parent.__name__, "CudnnChannelLastFp16") + TestCudnnChannelLastFp16.__name__ = cls_name + globals()[cls_name] = TestCudnnChannelLastFp16 + + def 
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index a1a3cc9ba7a..c936016a80b 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -81,7 +81,6 @@ def conv2d_forward_naive(input,
     if len(pad) == 4:
         pad_h_0, pad_h_1 = pad[0], pad[1]
         pad_w_0, pad_w_1 = pad[2], pad[3]
-
     out_h = 1 + (in_h + pad_h_0 + pad_h_1 - (dilation[0] *
                                              (f_h - 1) + 1)) // stride[0]
     out_w = 1 + (in_w + pad_w_0 + pad_w_1 - (dilation[1] *
@@ -204,6 +203,50 @@ def create_test_cudnn_channel_last_class(parent):
     globals()[cls_name] = TestCudnnChannelLastCase
 
 
+def create_test_cudnn_channel_last_fp16_class(parent, grad_check=True):
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestCudnnChannelLastFp16(parent):
+        def init_kernel_type(self):
+            self.use_cudnn = True
+            self.dtype = np.float16
+
+        def test_check_output(self):
+            if core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    self.check_output_with_place(place, atol=2e-2)
+
+        def test_check_grad_no_filter(self):
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place) and grad_check:
+                self.check_grad_with_place(
+                    place, ['Input'],
+                    'Output',
+                    max_relative_error=0.02,
+                    no_grad_set=set(['Filter']))
+
+        def test_check_grad_no_input(self):
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place) and grad_check:
+                self.check_grad_with_place(
+                    place, ['Filter'],
+                    'Output',
+                    max_relative_error=0.02,
+                    no_grad_set=set(['Input']))
+
+        def init_data_format(self):
+            self.data_format = "NHWC"
+
+        def init_test_case_2(self):
+            N, C, H, W = self.input_size
+            self.input_size = [N, H, W, C]
+
+    cls_name = "{0}_{1}".format(parent.__name__, "CudnnChannelLastFp16")
+    TestCudnnChannelLastFp16.__name__ = cls_name
+    globals()[cls_name] = TestCudnnChannelLastFp16
+
+
 def create_test_padding_SAME_class(parent):
     class TestPaddingSMAECase(parent):
         def init_paddings(self):
@@ -699,7 +742,6 @@ class TestConv2dOp_v2(OpTest):
         self.init_dilation()
         self.init_data_format()
         self.init_test_case()
-
         self.init_paddings()
         self.init_test_case_2()
@@ -1195,6 +1237,17 @@ create_test_cudnn_channel_last_class(TestWithStride_AsyPadding)
 create_test_cudnn_channel_last_class(TestWithGroup_AsyPadding)
 create_test_cudnn_channel_last_class(TestWithDilation_AsyPadding)
 
+create_test_cudnn_channel_last_fp16_class(
+    TestConv2dOp_AsyPadding, grad_check=False)
+create_test_cudnn_channel_last_fp16_class(
+    TestWithPad_AsyPadding, grad_check=False)
+create_test_cudnn_channel_last_fp16_class(
+    TestWithStride_AsyPadding, grad_check=False)
+create_test_cudnn_channel_last_fp16_class(
+    TestWithGroup_AsyPadding, grad_check=False)
+create_test_cudnn_channel_last_fp16_class(
+    TestWithDilation_AsyPadding, grad_check=False)
+
 
 # --------- test python API ---------------
 class TestConv2dAPI(OpTest):
-- 
GitLab