Commit 722aecd4 authored by Megvii Engine Team

feat(mgb): support fp16 nhwc backward

GitOrigin-RevId: 954ac6405a2e7b8b6719916c57e31a80f623b0c1
Parent 7a9f2ed9
......@@ -14,6 +14,7 @@
#include "src/cuda/utils.h"
#include "src/cuda/cudnn_wrapper.h"
#include "src/cuda/convolution/helper.h"
#include "src/cuda/conv_bias/helper.h"
using namespace megdnn;
using namespace cuda;
......@@ -31,27 +32,16 @@ bool ConvolutionBackwardDataImpl::AlgoCUDNN::is_available(
CUDNNBwdDataDescs D;
if (!is_cudnn_supported(args.as_fwd_args()))
TensorLayout bias_layout, z_layout;
conv_bias::CanonizedFilterMeta meta;
meta.copy_from(args.filter_meta);
conv_bias::BiasForwardSizeArgs bias_args{args.handle,
args.grad_layout, args.filter_layout, &bias_layout,
&z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY,
};
if (!conv_bias::is_cudnn_supported(bias_args))
return false;
#if CUDNN_VERSION >= 7500
// As of cuda10.0 and cudnn7.5, algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with
// TensorCore operations produces incorrect results, so we disable
// this algo. Please remove the following code when
// nvidia has fixed this issue.
// incorrect case:
// inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
// dtype=float16
if (args.filter_meta.dtype == dtype::Float16()) {
const char* algo_1 = "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1";
auto cmp_len = strlen(algo_1);
if (is_compute_capability_required(7, 0) &&
strncmp(name(), algo_1, cmp_len) == 0) {
return false;
}
}
#endif
auto& cudnn = args.handle->cudnn();
args.init_desc(D);
size_t workspace_size;
......
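For readers skimming the hunk above: the backward-data availability check is now routed through conv_bias::is_cudnn_supported by re-expressing the backward problem as a forward one. The standalone C++ sketch below (types and names are illustrative only, not the MegDNN API) shows the mapping the BiasForwardSizeArgs initializer appears to perform: the grad layout stands in for the forward src and the diff layout for the forward dst; the backward-filter hunk that follows applies the analogous mapping, with the filter gradient taking the filter's place.

    #include <cstddef>

    struct Layout { std::size_t n, c, h, w; };  // stand-in for TensorLayout

    // Layouts of a forward convolution: dst = conv(src, filter).
    struct FwdView {
        const Layout* src;
        const Layout* filter;
        const Layout* dst;
    };

    // Layouts held by a backward-data problem: grad (d/d src, same shape as
    // the forward src), the filter, and diff (d/d dst, same shape as the
    // forward dst).
    struct BwdDataView {
        const Layout* grad;
        const Layout* filter;
        const Layout* diff;
    };

    // The forward-style view that a cuDNN support check can inspect:
    // grad plays the forward src, diff plays the forward dst.
    FwdView as_forward(const BwdDataView& a) {
        return FwdView{a.grad, a.filter, a.diff};
    }

    int main() {
        // Example shapes only (NCHW order).
        Layout grad{2, 8, 18, 18}, filter{8, 8, 2, 2}, diff{2, 8, 11, 11};
        BwdDataView bwd{&grad, &filter, &diff};
        FwdView fwd = as_forward(bwd);
        (void)fwd;
        return 0;
    }
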
......@@ -14,6 +14,7 @@
#include "src/cuda/utils.h"
#include "src/cuda/cudnn_wrapper.h"
#include "src/cuda/convolution/helper.h"
#include "src/cuda/conv_bias/helper.h"
using namespace megdnn;
using namespace cuda;
......@@ -31,7 +32,14 @@ bool ConvolutionBackwardFilterImpl::AlgoCUDNN::is_available(
auto& cudnn = args.handle->cudnn();
CUDNNBwdFilterDescs D;
if (!is_cudnn_supported(args.as_fwd_args()))
TensorLayout bias_layout, z_layout;
conv_bias::CanonizedFilterMeta meta;
meta.copy_from(args.grad_filter_meta);
conv_bias::BiasForwardSizeArgs bias_args{args.handle,
args.src_layout, args.grad_layout, &bias_layout,
&z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY,
};
if (!conv_bias::is_cudnn_supported(bias_args))
return false;
args.init_desc(D);
......
......@@ -33,7 +33,8 @@ bool convolution::is_cudnn_supported(const ForwardSizeArgs &args) {
args.dst_layout->dtype.enumv() != DTypeEnum::QuantizedS8) {
return false;
}
} else if (args.filter_meta.format != param::Convolution::Format::NCHW) {
} else if (args.filter_meta.format != param::Convolution::Format::NCHW &&
args.filter_meta.format != param::Convolution::Format::NHWC) {
return false;
}
auto& fm = args.filter_meta;
......
......@@ -284,6 +284,16 @@ std::vector<TestArg> convolution::get_args_cudnn_5_1_failures() {
return args;
}
std::vector<TestArg> convolution::get_args_cudnn_5_1_backward() {
std::vector<TestArg> args;
args.emplace_back(
param::Convolution{param::Convolution::Mode::CROSS_CORRELATION, 2,
2, 2, 2},
TensorShape{2, 8, 18, 18}, TensorShape{8, 8, 2, 2});
return args;
}
std::vector<TestArg> convolution::get_args_x86_winograd_algorithm() {
std::vector<TestArg> args;
for (size_t ic_size : {8, 16}) {
......
......@@ -40,6 +40,7 @@ std::vector<TestArg> get_args_x86_direct_case_2();
std::vector<TestArg> get_args_fallback_templated_impl();
std::vector<TestArg> get_args_fallback_non_templated_impl();
std::vector<TestArg> get_args_cudnn_5_1_failures();
std::vector<TestArg> get_args_cudnn_5_1_backward();
std::vector<TestArg> get_args_x86_winograd_algorithm();
std::vector<TestArg> get_args_BRAIN_481();
std::vector<TestArg> get_args();
......
......@@ -238,6 +238,87 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
}
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) {
// Algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with TensorCore operations used to
// produce incorrect results on cuDNN 7.5. NVIDIA may have fixed this issue,
// so this test exercises the previously failing case:
// inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
// dtype=float16
using namespace convolution;
std::vector<TestArg> args = get_args_cudnn_5_1_backward();
Checker<ConvolutionBackwardData> checker(handle_cuda());
NormalRNG default_rng;
for (auto&& arg : args) {
float scale =
128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
scale = std::max(scale, 1.f);
UniformFloatRNG rng(scale, 2 * scale);
arg.param.format = param::Convolution::Format::NHWC;
arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
arg.filter = cvt_filter_nchw2nhwc(arg.filter);
auto src = TensorLayout(arg.src, dtype::Float32());
auto filter = TensorLayout(arg.filter, dtype::Float32());
TensorLayout dst;
{
auto opr = handle_cuda()->create_operator<Convolution>();
opr->param() = arg.param;
opr->deduce_layout(src, filter, dst);
}
src.dtype = dst.dtype = filter.dtype = dtype::Float16();
arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_epsilon(1e-2)
.set_param(arg.param)
.exec(TensorLayoutArray{filter, dst, src});
src.dtype = dst.dtype = filter.dtype = dtype::Float32();
arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_epsilon(1e-2)
.set_param(arg.param)
.exec(TensorLayoutArray{filter, dst, src});
}
}
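Both the test above and the NHWC test that follows build NHWC cases by permuting NCHW shapes with cvt_src_or_dst_nchw2nhwc and cvt_filter_nchw2nhwc. As a hedged illustration of what those test helpers presumably amount to for 4-D shapes (the real utilities live elsewhere in the test sources and may differ in detail):

    #include <array>
    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    using Shape4 = std::array<std::size_t, 4>;

    // {N, C, H, W} -> {N, H, W, C}; the same permutation of the trailing
    // three dims also maps an {OC, IC, FH, FW} filter to {OC, FH, FW, IC}.
    Shape4 nchw_to_nhwc(const Shape4& s) {
        return {s[0], s[2], s[3], s[1]};
    }

    int main() {
        Shape4 src{2, 8, 18, 18};  // the fp16 case exercised in the test above
        Shape4 nhwc = nchw_to_nhwc(src);
        assert((nhwc == Shape4{2, 18, 18, 8}));
        std::printf("%zu x %zu x %zu x %zu\n", nhwc[0], nhwc[1], nhwc[2], nhwc[3]);
        return 0;
    }
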
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) {
using namespace convolution;
std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
Checker<ConvolutionBackwardData> checker(handle_cuda());
NormalRNG default_rng;
for (auto&& arg : args) {
float scale =
64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
UniformFloatRNG rng(scale, 2 * scale);
arg.param.format = param::Convolution::Format::NHWC;
arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
arg.filter = cvt_filter_nchw2nhwc(arg.filter);
auto src = TensorLayout(arg.src, dtype::Float32());
auto filter = TensorLayout(arg.filter, dtype::Float32());
TensorLayout dst;
{
auto opr = handle_cuda()->create_operator<Convolution>();
opr->param() = arg.param;
opr->deduce_layout(src, filter, dst);
}
src.dtype = dst.dtype = filter.dtype = dtype::Float16();
arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_epsilon(1e-2)
.set_param(arg.param)
.exec(TensorLayoutArray{filter, dst, src});
src.dtype = dst.dtype = filter.dtype = dtype::Float32();
arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_epsilon(1e-2)
.set_param(arg.param)
.exec(TensorLayoutArray{filter, dst, src});
}
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) {
if (cuda::is_compute_capability_required(7, 0))
return;
......