diff --git a/dnn/src/cuda/conv_bias/cudnn_conv.cpp b/dnn/src/cuda/conv_bias/cudnn_conv.cpp index 5b735b9d81b4d66aeb87a27928c260c293766274..e4025e23f9bb1f6666b71db30873f1c143f59ca7 100644 --- a/dnn/src/cuda/conv_bias/cudnn_conv.cpp +++ b/dnn/src/cuda/conv_bias/cudnn_conv.cpp @@ -19,9 +19,6 @@ using namespace cuda; using namespace conv_bias; bool ConvBiasForwardImpl::AlgoCUDNNConv::is_available(const SizeArgs& args) const { - if (args.z_layout->ndim > 0) - return false; - if (args.filter_meta.format != Param::Format::NCHW && args.filter_meta.format != Param::Format::NHWC) { if (!args.src_layout->is_contiguous() || !args.dst_layout->is_contiguous()) { @@ -75,6 +72,15 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoCUDNNConv::get_workspace_bundle( sizes.push_back(dst_layout.span().dist_byte()); } + if (args.z_layout->ndim > 0 && + args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + auto z_layout = *args.z_layout; + z_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd( + args.src_layout->dtype, args.filter_layout->dtype, z_layout.dtype); + sizes.push_back(z_layout.span().dist_byte()); + } + SizeArgs conv_args = args; conv_args.dst_layout = &dst_layout; @@ -129,6 +135,22 @@ void ConvBiasForwardImpl::AlgoCUDNNConv::exec(const ExecArgs& args) const { cudnnGetErrorString(status), conv_args.to_string().c_str()); } + if (args.z_layout->ndim > 0) { + auto z_tensor = *args.z_tensor; + if (args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + z_tensor.raw_ptr = bundle.get(2); + z_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd( + args.src_layout->dtype, args.filter_layout->dtype, + z_tensor.layout.dtype); + auto typecvt = args.handle->create_operator<TypeCvt>(); + typecvt->exec(*args.z_tensor, z_tensor); + } + auto add = args.handle->create_operator<ElemwiseForward>(); + add->param().mode = Elemwise::Param::Mode::ADD; + add->exec({conv_dst_tensor, z_tensor}, conv_dst_tensor); + } + handle_bias_and_nonlinear( args.handle, args.nonlinear_mode, 
&conv_dst_tensor, args.dst_tensor, args.bias_tensor); diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp index 9478262b69506fea92d326b3b5950c5847cf7227..03c01c4a2fe87dc24c5b67e2edbba4e5cd851039 100644 --- a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp +++ b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp @@ -71,11 +71,12 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available( return false; } +#if CUDNN_VERSION < 7605 if (args.src_layout->dtype.enumv() == DTypeEnum::Float16 && - args.dst_layout->dtype.enumv() == DTypeEnum::Float16 && - param.format == param::ConvBias::Format::NHWC) { + args.dst_layout->dtype.enumv() == DTypeEnum::Float16) { return false; } +#endif #if CUDNN_MAJOR < 8 if (m_cudnn_enum == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM && diff --git a/dnn/test/cuda/conv_bias.cpp b/dnn/test/cuda/conv_bias.cpp index dcd7b41b7ffbdb441d6caec217ea48bc02f6d7b2..e2120a18df16af1f0d614da463bd16e44bbd52bd 100644 --- a/dnn/test/cuda/conv_bias.cpp +++ b/dnn/test/cuda/conv_bias.cpp @@ -1293,6 +1293,56 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_TENSORCORE_INT8) { } } +TEST_F(CUDA, CONV_BIAS_ADD_Z_CUDNN_CONVOLUTION) { + using namespace conv_bias; + Checker<ConvBiasForward> checker(handle_cuda()); + + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>( + ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>("CUDNN:Convolution", {}) + .c_str())); + + NormalRNG default_rng; + param::ConvBias param; + param.pad_h = param.pad_w = 1; + using Format = param::ConvBias::Format; + using NLMode = param::ConvBias::NonlineMode; + param.nonlineMode = NLMode::RELU; + auto c = [&](DType dt) { + param.format = Format::NCHW; + /// set epsilon to be 2e-3 to tolerate the accumulation error of the cudnn convolution + float eps = 2e-3; + if (dt == dtype::Float16()) { + eps = 1e-2; + param.compute_mode = param::ConvBias::ComputeMode::FLOAT32; + } + checker.set_dtype(0, dt) + .set_dtype(1, dt) + .set_dtype(2, dt) + .set_dtype(3, dt) + .set_dtype(4, 
dt) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_rng(2, &default_rng) + .set_rng(3, &default_rng) + .set_epsilon(eps) + .set_param(param) + .execs({{16, 256, 7, 7}, + {256, 256, 3, 3}, + {1, 256, 1, 1}, + {16, 256, 7, 7}, + {}}); + param.format = Format::NHWC; + checker.set_param(param).execs( + {{16, 7, 7, 256}, + {256, 3, 3, 256}, + {1, 1, 1, 256}, + {16, 7, 7, 256}, + {}}); + }; + c(dtype::Float32()); + c(dtype::Float16()); +} + #if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, BENCHMARK_CONV_BIAS_FORWARD_TENSORCORE_INT8) { require_compute_capability(7, 5);