From 722aecd437cc2e919b5fa0cade8585104ebd3979 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Wed, 18 Aug 2021 11:02:14 +0800
Subject: [PATCH] feat(mgb): support fp16 nhwc backward

GitOrigin-RevId: 954ac6405a2e7b8b6719916c57e31a80f623b0c1
---
 .../cuda/convolution/backward_data/cudnn.cpp  | 28 +++----
 .../convolution/backward_filter/cudnn.cpp     | 10 ++-
 dnn/src/cuda/convolution/helper.cpp           |  3 +-
 dnn/test/common/convolution.cpp               | 10 +++
 dnn/test/common/convolution.h                 |  1 +
 dnn/test/cuda/convolution.cpp                 | 81 +++++++++++++++++++
 6 files changed, 112 insertions(+), 21 deletions(-)

diff --git a/dnn/src/cuda/convolution/backward_data/cudnn.cpp b/dnn/src/cuda/convolution/backward_data/cudnn.cpp
index 821a84263..f5989683d 100644
--- a/dnn/src/cuda/convolution/backward_data/cudnn.cpp
+++ b/dnn/src/cuda/convolution/backward_data/cudnn.cpp
@@ -14,6 +14,7 @@
 #include "src/cuda/utils.h"
 #include "src/cuda/cudnn_wrapper.h"
 #include "src/cuda/convolution/helper.h"
+#include "src/cuda/conv_bias/helper.h"
 
 using namespace megdnn;
 using namespace cuda;
@@ -31,27 +32,16 @@ bool ConvolutionBackwardDataImpl::AlgoCUDNN::is_available(
 
     CUDNNBwdDataDescs D;
 
-    if (!is_cudnn_supported(args.as_fwd_args()))
+    TensorLayout bias_layout, z_layout;
+    conv_bias::CanonizedFilterMeta meta;
+    meta.copy_from(args.filter_meta);
+    conv_bias::BiasForwardSizeArgs bias_args{args.handle,
+            args.grad_layout, args.filter_layout, &bias_layout,
+            &z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY,
+    };
+    if (!conv_bias::is_cudnn_supported(bias_args))
         return false;
 
-#if CUDNN_VERSION >= 7500
-    // As in cuda10.0 and cudnn7.5, algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with
-    // TensorCore operations produces incorrect result. So we disable
-    // this algo. Please remove the following code, when
-    // nvidia has fixed this issue.
-    // incorrect case:
-    // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
-    // dtype=float16
-    if (args.filter_meta.dtype == dtype::Float16()) {
-        const char* algo_1 = "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1";
-        auto cmp_len = strlen(algo_1);
-        if (is_compute_capability_required(7, 0) &&
-            strncmp(name(), algo_1, cmp_len) == 0) {
-            return false;
-        }
-    }
-#endif
-
     auto& cudnn = args.handle->cudnn();
     args.init_desc(D);
     size_t workspace_size;
diff --git a/dnn/src/cuda/convolution/backward_filter/cudnn.cpp b/dnn/src/cuda/convolution/backward_filter/cudnn.cpp
index 07addd6c2..dd6b8b25c 100644
--- a/dnn/src/cuda/convolution/backward_filter/cudnn.cpp
+++ b/dnn/src/cuda/convolution/backward_filter/cudnn.cpp
@@ -14,6 +14,7 @@
 #include "src/cuda/utils.h"
 #include "src/cuda/cudnn_wrapper.h"
 #include "src/cuda/convolution/helper.h"
+#include "src/cuda/conv_bias/helper.h"
 
 using namespace megdnn;
 using namespace cuda;
@@ -31,7 +32,14 @@ bool ConvolutionBackwardFilterImpl::AlgoCUDNN::is_available(
     auto& cudnn = args.handle->cudnn();
     CUDNNBwdFilterDescs D;
 
-    if (!is_cudnn_supported(args.as_fwd_args()))
+    TensorLayout bias_layout, z_layout;
+    conv_bias::CanonizedFilterMeta meta;
+    meta.copy_from(args.grad_filter_meta);
+    conv_bias::BiasForwardSizeArgs bias_args{args.handle,
+            args.src_layout, args.grad_layout, &bias_layout,
+            &z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY,
+    };
+    if (!conv_bias::is_cudnn_supported(bias_args))
         return false;
 
     args.init_desc(D);
diff --git a/dnn/src/cuda/convolution/helper.cpp b/dnn/src/cuda/convolution/helper.cpp
index 2cfb99420..747e90b34 100644
--- a/dnn/src/cuda/convolution/helper.cpp
+++ b/dnn/src/cuda/convolution/helper.cpp
@@ -33,7 +33,8 @@ bool convolution::is_cudnn_supported(const ForwardSizeArgs &args) {
             args.dst_layout->dtype.enumv() != DTypeEnum::QuantizedS8) {
             return false;
         }
-    } else if (args.filter_meta.format != param::Convolution::Format::NCHW) {
+    } else if (args.filter_meta.format != param::Convolution::Format::NCHW &&
+               args.filter_meta.format != param::Convolution::Format::NHWC) {
         return false;
     }
     auto& fm = args.filter_meta;
diff --git a/dnn/test/common/convolution.cpp b/dnn/test/common/convolution.cpp
index 57a711c58..8b228112b 100644
--- a/dnn/test/common/convolution.cpp
+++ b/dnn/test/common/convolution.cpp
@@ -284,6 +284,16 @@ std::vector<TestArg> convolution::get_args_cudnn_5_1_failures() {
     return args;
 }
 
+std::vector<TestArg> convolution::get_args_cudnn_5_1_backward() {
+    std::vector<TestArg> args;
+    args.emplace_back(
+            param::Convolution{param::Convolution::Mode::CROSS_CORRELATION, 2,
+                               2, 2, 2},
+            TensorShape{2, 8, 18, 18}, TensorShape{8, 8, 2, 2});
+
+    return args;
+}
+
 std::vector<TestArg> convolution::get_args_x86_winograd_algorithm() {
     std::vector<TestArg> args;
     for (size_t ic_size : {8, 16}) {
diff --git a/dnn/test/common/convolution.h b/dnn/test/common/convolution.h
index f4e10b383..72ae3c647 100644
--- a/dnn/test/common/convolution.h
+++ b/dnn/test/common/convolution.h
@@ -40,6 +40,7 @@ std::vector<TestArg> get_args_x86_direct_case_2();
 std::vector<TestArg> get_args_fallback_templated_impl();
 std::vector<TestArg> get_args_fallback_non_templated_impl();
 std::vector<TestArg> get_args_cudnn_5_1_failures();
+std::vector<TestArg> get_args_cudnn_5_1_backward();
 std::vector<TestArg> get_args_x86_winograd_algorithm();
 std::vector<TestArg> get_args_BRAIN_481();
 std::vector<TestArg> get_args();
diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp
index c88d1ac60..0fa437db0 100644
--- a/dnn/test/cuda/convolution.cpp
+++ b/dnn/test/cuda/convolution.cpp
@@ -238,6 +238,87 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
     }
 }
 
+TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) {
+    // With CUDA 10.0 and cuDNN 7.5, algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
+    // with TensorCore operations produced incorrect results; nvidia may
+    // have fixed this issue since. This test covers the previously
+    // failing case:
+    // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
+    // dtype=float16
+    using namespace convolution;
+    std::vector<TestArg> args = get_args_cudnn_5_1_backward();
+    Checker<ConvolutionBackwardData> checker(handle_cuda());
+    NormalRNG default_rng;
+    for (auto&& arg : args) {
+        float scale =
+                128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
+        scale = std::max(scale, 1.f);
+        UniformFloatRNG rng(scale, 2 * scale);
+        arg.param.format = param::Convolution::Format::NHWC;
+        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
+        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
+        auto src = TensorLayout(arg.src, dtype::Float32());
+        auto filter = TensorLayout(arg.filter, dtype::Float32());
+        TensorLayout dst;
+        {
+            auto opr = handle_cuda()->create_operator<Convolution>();
+            opr->param() = arg.param;
+            opr->deduce_layout(src, filter, dst);
+        }
+        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
+        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_epsilon(1e-2)
+                .set_param(arg.param)
+                .exec(TensorLayoutArray{filter, dst, src});
+        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
+        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_epsilon(1e-2)
+                .set_param(arg.param)
+                .exec(TensorLayoutArray{filter, dst, src});
+    }
+}
+
+TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) {
+    using namespace convolution;
+    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
+    Checker<ConvolutionBackwardData> checker(handle_cuda());
+    NormalRNG default_rng;
+    for (auto&& arg : args) {
+        float scale =
+                64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
+        UniformFloatRNG rng(scale, 2 * scale);
+        arg.param.format = param::Convolution::Format::NHWC;
+        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
+        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
+        auto src = TensorLayout(arg.src, dtype::Float32());
+        auto filter = TensorLayout(arg.filter, dtype::Float32());
+        TensorLayout dst;
+        {
+            auto opr = handle_cuda()->create_operator<Convolution>();
+            opr->param() = arg.param;
+            opr->deduce_layout(src, filter, dst);
+        }
+        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
+        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_epsilon(1e-2)
+                .set_param(arg.param)
+                .exec(TensorLayoutArray{filter, dst, src});
+        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
+        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_epsilon(1e-2)
+                .set_param(arg.param)
+                .exec(TensorLayoutArray{filter, dst, src});
+    }
+}
+
 TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) {
     if (cuda::is_compute_capability_required(7, 0))
         return;
-- 
GitLab
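
Note: the new tests convert shapes with the cvt_src_or_dst_nchw2nhwc and
cvt_filter_nchw2nhwc helpers from dnn/test/common before switching the param
to param::Convolution::Format::NHWC. For readers without the tree at hand, a
minimal sketch of the shape permutation such a helper performs is below; the
function name and body are illustrative assumptions, not the in-tree
implementation, and only megdnn::TensorShape from megdnn/basic_types.h is
taken from the patch itself.

    #include "megdnn/basic_types.h"  // megdnn::TensorShape

    // Illustrative sketch (assumption): permute a 4-d NCHW shape
    // {N, C, H, W} into the NHWC order {N, H, W, C} expected once
    // param::Convolution::Format::NHWC is set. The filter helper is
    // assumed to permute an OIHW kernel {O, I, FH, FW} analogously,
    // moving the input-channel dim last.
    static megdnn::TensorShape nchw2nhwc(const megdnn::TensorShape& s) {
        // s.ndim == 4 is assumed; s[0..3] = {N, C, H, W}
        return megdnn::TensorShape{s[0], s[2], s[3], s[1]};
    }

For example, under this sketch the test's input {2, 8, 18, 18} becomes
{2, 18, 18, 8} and the kernel {8, 8, 2, 2} becomes {8, 2, 2, 8}, matching the
layouts the NHWC backward-data path enabled by this patch consumes.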