diff --git a/dnn/include/megdnn/oprs/nn.h b/dnn/include/megdnn/oprs/nn.h index 406c4ea59586b2f1cf3d86bd27ec1647b6ef0e19..b319226efcec55d19a671cf20440a46959c21322 100644 --- a/dnn/include/megdnn/oprs/nn.h +++ b/dnn/include/megdnn/oprs/nn.h @@ -511,6 +511,12 @@ protected: const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst, size_t workspace_in_bytes, const PreprocessedFilter* preprocessed_filter); + + CanonizedFilterMeta check_exec_allow_noncontiguous( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst, size_t workspace_in_bytes, + const PreprocessedFilter* preprocessed_filter); }; using ConvBias = ConvBiasForward; diff --git a/dnn/src/common/conv_bias.cpp b/dnn/src/common/conv_bias.cpp index 296e88762ec7f21f333a437341653ca9e5231efe..76ef5c9602220167d7133ab49b7803138b49cb13 100644 --- a/dnn/src/common/conv_bias.cpp +++ b/dnn/src/common/conv_bias.cpp @@ -11,30 +11,18 @@ */ #include "src/common/conv_bias.h" -#include "megdnn/oprs/nn.h" #include "src/common/utils.h" #include "src/common/opr_delegate.h" namespace megdnn { +namespace { -void ConvBiasForward::deduce_dtype(DType src, DType filter, DType /* bias */, - DType /* z */, DType& dst) { - check_or_deduce_dtype_fwd(src, filter, dst); -} - -void ConvBiasForward::deduce_layout(const TensorLayout& src, - const TensorLayout& filter, - const TensorLayout& /* bias */, - const TensorLayout& /* z */, - TensorLayout& dst) { - deduce_layout_fwd(src, filter, dst); -} - -ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( - const TensorLayout& src, const TensorLayout& filter, - const TensorLayout& bias, const TensorLayout& z, - const TensorLayout& dst, size_t workspace_in_bytes, - const PreprocessedFilter* preprocessed_filter) { +void do_check_exec_common( + ConvBiasForward* opr, const TensorLayout& src, + const TensorLayout& filter, const TensorLayout& bias, + const TensorLayout& z, const TensorLayout& dst, + size_t workspace_in_bytes, + const ConvBiasForward::PreprocessedFilter* preprocessed_filter) { megdnn_assert((src.dtype.enumv() == filter.dtype.enumv()) || (src.dtype.enumv() == DTypeEnum::Quantized4Asymm && filter.dtype.enumv() == DTypeEnum::QuantizedS4)); @@ -52,9 +40,8 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( } } - auto ret = check_layout_fwd(src, filter, dst); megdnn_assert_contiguous(bias); - auto required_workspace_in_bytes = get_workspace_in_bytes( + auto required_workspace_in_bytes = opr->get_workspace_in_bytes( src, filter, bias, z, dst, preprocessed_filter); megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes, "worksapce have size of %zu, but need %zu", @@ -68,55 +55,58 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( return bias.eq_layout(dst); } }; - if (check_eq(bias, dst)) - return ret; - if (param().format == param::ConvBias::Format::NCHW || - param().format == param::ConvBias::Format::NCHW4_NCHW) { + if (check_eq(bias, dst)) { + return; + } + if (opr->param().format == param::ConvBias::Format::NCHW || + opr->param().format == param::ConvBias::Format::NCHW4_NCHW) { megdnn_assert(bias.shape[0] == 1); megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", bias.to_string().c_str(), dst.to_string().c_str()); megdnn_assert(bias.shape[2] == 1); megdnn_assert(bias.shape[3] == 1); - } else if (param().format == param::ConvBias::Format::NHWC) { + } else if (opr->param().format == param::ConvBias::Format::NHWC) { megdnn_assert(bias.shape[0] == 1); megdnn_assert(bias.shape[1] == 1); megdnn_assert(bias.shape[2] == 1); megdnn_assert(bias.shape[3] == dst.shape[3], "bias:%s, dst:%s", bias.to_string().c_str(), dst.to_string().c_str()); - } else if (param().format == param::ConvBias::Format::NCHW4 || - param().format == param::ConvBias::Format::NCHW44 || - param().format == param::ConvBias::Format::NCHW44_DOT || - param().format == param::ConvBias::Format::NCHW32_NCHW4) { + } else if (opr->param().format == param::ConvBias::Format::NCHW4 || + opr->param().format == param::ConvBias::Format::NCHW44 || + opr->param().format == param::ConvBias::Format::NCHW44_DOT || + opr->param().format == + param::ConvBias::Format::NCHW32_NCHW4) { megdnn_assert(bias.shape[0] == 1); megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", bias.to_string().c_str(), dst.to_string().c_str()); megdnn_assert(bias.shape[2] == 1); megdnn_assert(bias.shape[3] == 1); megdnn_assert(bias.shape[4] == 4); - } else if (param().format == param::ConvBias::Format::NCHW8 || - param().format == param::ConvBias::Format::NCHW88 ) { + } else if (opr->param().format == param::ConvBias::Format::NCHW8 || + opr->param().format == param::ConvBias::Format::NCHW88) { megdnn_assert(bias.shape[0] == 1); megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", bias.to_string().c_str(), dst.to_string().c_str()); megdnn_assert(bias.shape[2] == 1); megdnn_assert(bias.shape[3] == 1); megdnn_assert(bias.shape[4] == 8); - } else if (param().format == param::ConvBias::Format::NCHW32 || - param().format == param::ConvBias::Format::NCHW4_NCHW32) { + } else if (opr->param().format == param::ConvBias::Format::NCHW32 || + opr->param().format == + param::ConvBias::Format::NCHW4_NCHW32) { megdnn_assert(bias.shape[0] == 1); megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", bias.to_string().c_str(), dst.to_string().c_str()); megdnn_assert(bias.shape[2] == 1); megdnn_assert(bias.shape[3] == 1); megdnn_assert(bias.shape[4] == 32); - } else if (param().format == param::ConvBias::Format::CHWN4) { + } else if (opr->param().format == param::ConvBias::Format::CHWN4) { megdnn_assert(bias.shape[0] == dst.shape[0], "bias:%s, dst:%s", bias.to_string().c_str(), dst.to_string().c_str()); megdnn_assert(bias.shape[1] == 1); megdnn_assert(bias.shape[2] == 1); megdnn_assert(bias.shape[3] == 1); megdnn_assert(bias.shape[4] == 4); - } else if (param().format == param::ConvBias::Format::NCHW64) { + } else if (opr->param().format == param::ConvBias::Format::NCHW64) { megdnn_assert(bias.shape[0] == 1); megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", bias.to_string().c_str(), dst.to_string().c_str()); @@ -124,7 +114,8 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( megdnn_assert(bias.shape[3] == 1); megdnn_assert(bias.shape[4] == 64); } else { - megdnn_assert(param().format == param::ConvBias::Format::NHWCD4); + megdnn_assert(opr->param().format == + param::ConvBias::Format::NHWCD4); megdnn_assert(bias.shape[0] == 1); megdnn_assert(bias.shape[1] == 1); megdnn_assert(bias.shape[2] == dst.shape[2], "bias:%s, dst:%s", @@ -135,11 +126,53 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( } if (z.ndim != 0) { - megdnn_assert(param().format != param::ConvBias::Format::NCHW4_NCHW32); - megdnn_assert(param().format != param::ConvBias::Format::NCHW32_NCHW4); + megdnn_assert(opr->param().format != + param::ConvBias::Format::NCHW4_NCHW32); + megdnn_assert(opr->param().format != + param::ConvBias::Format::NCHW32_NCHW4); megdnn_assert(z.dtype.enumv() == dst.dtype.enumv()); megdnn_assert(z.eq_shape(dst)); } +} + +} // namespace + +void ConvBiasForward::deduce_dtype(DType src, DType filter, DType /* bias */, + DType /* z */, DType& dst) { + check_or_deduce_dtype_fwd(src, filter, dst); +} + +void ConvBiasForward::deduce_layout(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& /* bias */, + const TensorLayout& /* z */, + TensorLayout& dst) { + deduce_layout_fwd(src, filter, dst); +} + +ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst, size_t workspace_in_bytes, + const PreprocessedFilter* preprocessed_filter) { + do_check_exec_common(this, src, filter, bias, z, dst, workspace_in_bytes, + preprocessed_filter); + auto ret = check_layout_fwd(src, filter, dst); + return ret; +} + +ConvBiasForward::CanonizedFilterMeta +ConvBiasForward::check_exec_allow_noncontiguous( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst, size_t workspace_in_bytes, + const PreprocessedFilter* preprocessed_filter) { + do_check_exec_common(this, src, filter, bias, z, dst, workspace_in_bytes, + preprocessed_filter); + TensorLayout dst_expected; + dst_expected.dtype = dst.dtype; + auto ret = deduce_layout_fwd(src, filter, dst_expected); + megdnn_assert_eq_shape(dst_expected, dst); return ret; } diff --git a/dnn/src/common/conv_bias.h b/dnn/src/common/conv_bias.h index 3a55afe418008462debc5b06ac6625008404a248..84489c87c98493ea6814bf734db00c5b2845a850 100644 --- a/dnn/src/common/conv_bias.h +++ b/dnn/src/common/conv_bias.h @@ -12,6 +12,7 @@ #include "megdnn/handle.h" #include "megdnn/opr_param_defs.h" #include "megdnn/oprs/general.h" +#include "megdnn/oprs/nn.h" #include "megdnn/oprs/nn_int.h" #include "src/common/utils.h" diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp index 0730b2bf82645988583357cbe4c0c05dafd4c785..1474f7c73a29cf4faea21e96fa887f9a90a99309 100644 --- a/dnn/src/common/convolution.cpp +++ b/dnn/src/common/convolution.cpp @@ -595,8 +595,6 @@ ConvolutionBase::deduce_layout_fwd(const TensorLayout& src, TensorLayout& dst) const { auto errmsg = [&]() { return get_errmsg(src, filter, dst, param()); }; MEGDNN_MARK_USED_VAR(errmsg); - megdnn_assert_contiguous(src); - megdnn_assert_contiguous(filter); megdnn_assert(src.ndim >= 3_z, "%s", errmsg().c_str()); megdnn_assert(((src.dtype.enumv() == filter.dtype.enumv()) || (src.dtype.enumv() == DTypeEnum::Quantized4Asymm && @@ -976,6 +974,8 @@ ConvolutionBase::CanonizedFilterMeta ConvolutionBase::check_layout_fwd( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst) const { + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(filter); TensorLayout dst_expected; dst_expected.dtype = dst.dtype; @@ -989,6 +989,8 @@ ConvolutionBase::CanonizedFilterMeta ConvolutionBase::check_layout_fwd( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst) const { + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(filter); TensorLayout dst_expected; dst_expected.dtype = dst.dtype; @@ -1002,6 +1004,8 @@ ConvolutionBase::CanonizedFilterMeta ConvolutionBase::check_layout_fwd( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst) const { + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(filter); TensorLayout dst_expected; dst_expected.dtype = dst.dtype; diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp index a1baa9a3460f20099f3ddc8a98252e10eada50a2..a212ee4c255d6dbc0f6ae8dddf8a31bd3c5e5413 100644 --- a/dnn/src/cuda/conv_bias/algo.cpp +++ b/dnn/src/cuda/conv_bias/algo.cpp @@ -116,8 +116,9 @@ ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) - : SizeArgs(o, src, filter, o->check_layout_fwd(src, filter, dst), bias, - z, dst, preprocessed_filter) {} + : SizeArgs(o, src, filter, + o->make_canonized_filter_meta(src.ndim, filter), bias, z, + dst, preprocessed_filter) {} ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( ConvBiasForwardImpl* o, const TensorLayout& src, diff --git a/dnn/src/cuda/conv_bias/batched_matmul.cpp b/dnn/src/cuda/conv_bias/batched_matmul.cpp index 08f53cc8504e5e88b929d2b6c0229e1b116fff63..398a5f627bd20d32b48e23e3eda99aa755780388 100644 --- a/dnn/src/cuda/conv_bias/batched_matmul.cpp +++ b/dnn/src/cuda/conv_bias/batched_matmul.cpp @@ -75,8 +75,8 @@ ConvBiasForwardImpl::AlgoBatchedMatmul::get_subopr_list( const TensorLayoutArray& layouts, const OperatorBase* opr) const { const ConvBiasForwardImpl* conv_bias_opr = static_cast(opr); - CanonizedFilterMeta fm = - conv_bias_opr->check_layout_fwd(layouts[0], layouts[1], layouts[4]); + CanonizedFilterMeta fm = conv_bias_opr->make_canonized_filter_meta( + layouts[0].ndim, layouts[1]); auto&& config = sub_opr_config(fm, layouts[0], layouts[1], layouts[4], conv_bias_opr); diff --git a/dnn/src/cuda/conv_bias/chanwise.cpp b/dnn/src/cuda/conv_bias/chanwise.cpp index c4b946aeea3d7a5f7daee4bb86cef568dd5aa180..77ce45904edf4e955a9dd5605938e1685b64f5e2 100644 --- a/dnn/src/cuda/conv_bias/chanwise.cpp +++ b/dnn/src/cuda/conv_bias/chanwise.cpp @@ -20,6 +20,10 @@ using namespace conv_bias; bool ConvBiasForwardImpl::AlgoChanwise::is_available( const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if (args.src_layout->dtype == args.filter_layout->dtype && args.src_layout->dtype == dtype::BFloat16()) { return false; diff --git a/dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp b/dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp index 1fa875b02f1df1baba6d65963897de0e094713b8..178e78e77abd6c21642e31169545fa27c37771f0 100644 --- a/dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp +++ b/dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp @@ -21,6 +21,10 @@ using namespace conv_bias; bool ConvBiasForwardImpl::AlgoChanwise8x8x32::is_available( const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if (args.z_layout->ndim > 0) return false; using NonlineMode = param::ConvBias::NonlineMode; diff --git a/dnn/src/cuda/conv_bias/chanwise_small.cpp b/dnn/src/cuda/conv_bias/chanwise_small.cpp index afe6adbbe07162864388ab381fe591d4d78fe610..a87aca06aaf9ccd69491a410ab0e5f5946353d29 100644 --- a/dnn/src/cuda/conv_bias/chanwise_small.cpp +++ b/dnn/src/cuda/conv_bias/chanwise_small.cpp @@ -30,6 +30,10 @@ inline bool is_available_small(const chanwise::Param& param) { bool ConvBiasForwardImpl::AlgoChanwiseSmall::is_available( const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if (args.src_layout->dtype == args.filter_layout->dtype && args.src_layout->dtype == dtype::BFloat16()) { return false; diff --git a/dnn/src/cuda/conv_bias/conv_nchwqs8.cpp b/dnn/src/cuda/conv_bias/conv_nchwqs8.cpp index fb5202c057dfd71e2619ed4f6e0af72bcf605764..2520cc70663280ab1297eb27965463af748c5353 100644 --- a/dnn/src/cuda/conv_bias/conv_nchwqs8.cpp +++ b/dnn/src/cuda/conv_bias/conv_nchwqs8.cpp @@ -63,6 +63,10 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS8::make_inner_layout( bool ConvBiasForwardImpl::AlgoFallbackNCHWQS8::is_available( const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } auto&& param = args.opr->param(); bool is_format_ok = param.format == param::ConvBias::Format::NCHW; bool is_version_ok = CUDNN_VERSION >= 7500; diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp index f0f6737ead8b29a68c9666ea068cff5170b56fe7..590bc8c379d28144efc3897c0a5faca2df457412 100644 --- a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp +++ b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp @@ -24,6 +24,10 @@ using namespace conv_bias; bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available( const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if ((args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS4 || args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) && args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4) diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_dp4a.cpp index b6c49577e43e3b8ddda406a992ef3ac376984275..d0c332a404653e0acbbe1213151fefcb8c9c8e87 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_dp4a.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_dp4a.cpp @@ -74,6 +74,10 @@ void dispatch_kernel(const int8_t* d_src, const int8_t* d_filter, bool ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm::is_available( const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if (args.bias_layout->ndim <= 0) return false; diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma.cpp index f4cc3f9f8ad7d9d6c697b1d17b31179d05baee70..97abc14ab83b2f7fe791ad8da00b1dedc26af7f8 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma.cpp @@ -62,6 +62,10 @@ void dispatch_kernel(const int8_t* d_src, const int8_t* d_filter, bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm::is_available( const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if (args.bias_layout->ndim <= 0) return false; diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp index 240122c11acfc50139d921326c5349f72eff3869..687dbbd90d4f02040794d33cd7f29c0be5b95d04 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp @@ -109,6 +109,10 @@ INST(PerChannelBiasVisitor); bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter:: is_available(const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if (args.bias_layout->ndim <= 0) return false; diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp index 98d00829101daebf93d2b9e89c13fceaac4a9c50..a61e89cb4779fa9d7e62f56987e62c9200d297a8 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp @@ -109,6 +109,10 @@ INST(PerChannelBiasVisitor); bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth:: is_available(const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if (args.bias_layout->ndim <= 0) return false; diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp index 3e674080dabc2e76dba71f0b6497f7e10793453a..03cb358b08739a89694bbb14eced182eb6112fb6 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp @@ -23,6 +23,10 @@ using namespace convolution; #if CUDA_VERSION >= 10020 bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available( const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if (args.bias_layout->ndim <= 0) return false; diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp index b38d607ad37ea39524ea9233c6b60c341c1a591d..3b7e2e7042481a053a1bda8d90e268599da8dd44 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @@ -20,6 +20,10 @@ using namespace cuda; bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if (args.bias_layout->ndim <= 0) return false; diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp index d7818c598318d7f9f2ab380bfba3573e1a0e6b95..f1aaa2fbcb15805fe1c2307a404d9051e3716a11 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp @@ -20,6 +20,10 @@ using namespace cuda; #if CUDA_VERSION >= 10000 bool ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::is_available( const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if (args.bias_layout->ndim <= 0) return false; diff --git a/dnn/src/cuda/conv_bias/matmul.cpp b/dnn/src/cuda/conv_bias/matmul.cpp index 709dc1131476d3b4c6ddb226e2d12f7972d2d79c..a28ab6fd6612054737eb3ebd43bdd3053f3e5417 100644 --- a/dnn/src/cuda/conv_bias/matmul.cpp +++ b/dnn/src/cuda/conv_bias/matmul.cpp @@ -61,8 +61,8 @@ ConvBiasForwardImpl::AlgoMatmul::get_subopr_list( const TensorLayoutArray& layouts, const OperatorBase* opr) const { const ConvBiasForwardImpl* conv_bias_opr = static_cast(opr); - CanonizedFilterMeta fm = - conv_bias_opr->check_layout_fwd(layouts[0], layouts[1], layouts[4]); + CanonizedFilterMeta fm = conv_bias_opr->make_canonized_filter_meta( + layouts[0].ndim, layouts[1]); auto&& config = sub_opr_config(fm, layouts[0], layouts[1], layouts[4], conv_bias_opr); diff --git a/dnn/src/cuda/conv_bias/opr_impl.cpp b/dnn/src/cuda/conv_bias/opr_impl.cpp index b6ed25867fadc879cdf725565f901caa9ae69a9d..373d6392c60ca18bcd5c02cba40e7421d027e2a7 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.cpp +++ b/dnn/src/cuda/conv_bias/opr_impl.cpp @@ -16,6 +16,7 @@ #include "src/cuda/handle.h" #include "src/cuda/utils.h" +#include "src/common/conv_bias.h" #include "src/common/algo_chooser.h" #include "src/cuda/cudnn_with_check.h" @@ -28,8 +29,9 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) { - check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, - workspace.size, preprocessed_filter); + check_exec_allow_noncontiguous(src.layout, filter.layout, bias.layout, + z.layout, dst.layout, workspace.size, + preprocessed_filter); AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace, preprocessed_filter); auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h index 6b292778ee7fe7f4a2294ed1e5cac046af23f94f..07eaefd1a78ee8859a7d58dae16abe39d24416df 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.h +++ b/dnn/src/cuda/conv_bias/opr_impl.h @@ -87,6 +87,7 @@ public: const AlgoAttribute& negative_attr) override; private: + static AlgoPack sm_algo_pack; }; diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp b/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp index 4a464fa1672247c6988b8255f2f02a59340d82da..5f70adb292bff3ec885bde90cecb56be30024e3f 100644 --- a/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp @@ -25,6 +25,10 @@ using namespace activation_u4; #if CUDA_VERSION >= 10000 bool ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::is_available( const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || + !args.dst_layout->is_contiguous()) { + return false; + } if (args.z_layout->ndim > 0) return false; diff --git a/dnn/src/naive/conv_bias/opr_impl.cpp b/dnn/src/naive/conv_bias/opr_impl.cpp index 20fa04e4bca3542e4fb93576062810356048b81b..649b5dcae4a8ab3d040bb13d322c2dd237e5b8f7 100644 --- a/dnn/src/naive/conv_bias/opr_impl.cpp +++ b/dnn/src/naive/conv_bias/opr_impl.cpp @@ -233,9 +233,9 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, dt_byte* workspace_ptr = workspace.raw_ptr; // ============================w * f + b================================ - auto filter_meta = - check_exec(src.layout, filter.layout, bias.layout, z.layout, - dst.layout, workspace.size, preprocessed_filter); + auto filter_meta = check_exec_allow_noncontiguous( + src.layout, filter.layout, bias.layout, z.layout, dst.layout, + workspace.size, preprocessed_filter); auto sfb = dst; if (bias.layout.dtype.enumv() != dst.layout.dtype.enumv()) { // intermediate result diff --git a/dnn/test/cuda/conv_bias.cpp b/dnn/test/cuda/conv_bias.cpp index 30fe47b27a862d01a70d6997e1247e49917234cf..a7db681101f84fcc4b8503331c37fdfc048461b1 100644 --- a/dnn/test/cuda/conv_bias.cpp +++ b/dnn/test/cuda/conv_bias.cpp @@ -749,6 +749,18 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_CUDNN_CONVOLUTION) { .set_param(arg.param) .execs({arg.src, arg.filter, arg.bias, {}, {}}); } + //! noncontiguous case + { + param::ConvBias param; + param.pad_h = param.pad_w = 1; + checker.set_param(param).execl(TensorLayoutArray{ + {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()}, + {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}, + {{}, {}, dtype::Float32()}, + {{}, {}, dtype::Float32()}, + {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()}, + }); + } } TEST_F(CUDA, CONV_BIAS_FORWARD_INPLACE_MATMUL) { @@ -791,6 +803,18 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_INPLACE_MATMUL) { .execs({{2, 3, 3, 16}, {5, 3, 3, 3}, {1, 5, 1, 1}, {}, {}}) .execs({{2, 2, 8, 3}, {3, 2, 3, 3}, {1, 3, 1, 1}, {}, {}}); } + //! noncontiguous case + { + param::ConvBias param; + param.pad_h = param.pad_w = 1; + checker.set_param(param).execl(TensorLayoutArray{ + {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()}, + {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}, + {{}, {}, dtype::Float32()}, + {{}, {}, dtype::Float32()}, + {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()}, + }); + } } TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL) { @@ -835,6 +859,18 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL) { .execs({{2, 3, 3, 16}, {5, 3, 3, 3}, {1, 5, 1, 1}, {}, {}}) .execs({{2, 2, 8, 3}, {3, 2, 3, 3}, {1, 3, 1, 1}, {}, {}}); } + //! noncontiguous case + { + param::ConvBias param; + param.pad_h = param.pad_w = 1; + checker.set_param(param).execl(TensorLayoutArray{ + {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()}, + {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}, + {{}, {}, dtype::Float32()}, + {{}, {}, dtype::Float32()}, + {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()}, + }); + } } TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_8x8x32) { @@ -880,6 +916,21 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_8x8x32) { .execs({{2, 3, 16, 3}, {5, 3, 3, 3}, {1, 1, 1, 5}, {}, {}}) .execs({{2, 8, 3, 2}, {3, 3, 3, 2}, {1, 1, 1, 3}, {}, {}}); } + //! noncontiguous case + { + param::ConvBias param; + param.pad_h = param.pad_w = 1; + param.format = param::ConvBias::Format::NHWC; + checker.set_param(param).execl(TensorLayoutArray{ + {{2, 7, 7, 16}, {1568, 224, 32, 1}, dtype::QuantizedS8{1.2f}}, + {{16, 3, 3, 16}, {144, 48, 16, 1}, dtype::QuantizedS8{1.3f}}, + {{}, {}, dtype::QuantizedS32{1.2f * 1.3f}}, + {{}, {}, dtype::QuantizedS8{1.1f}}, + {{2, 7, 7, 16}, + {1568, 224, 32, 1}, + dtype::QuantizedS32{1.2f * 1.3f}}, + }); + } } TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_NCHW4) { @@ -913,6 +964,21 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_NCHW4) { checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {1, 4, 1, 1, 4}, {}, {}}); checker.exec( {{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {1, 64, 1, 1, 4}, {}, {}}); + //! noncontiguous case + { + param::ConvBias param; + param.pad_h = param.pad_w = 1; + param.format = ConvBias::Param::Format::NCHW4; + checker.set_param(param).execl(TensorLayoutArray{ + {{2, 4, 7, 7, 4}, {1568, 196, 28, 4, 1}, dtype::QuantizedS8{1.2f}}, + {{16, 4, 3, 3, 4}, {144, 36, 12, 4, 1}, dtype::QuantizedS8{1.3f}}, + {{}, {}, dtype::QuantizedS32{1.2f * 1.3f}}, + {{}, {}, dtype::QuantizedS8{1.1f}}, + {{2, 4, 7, 7, 4}, + {1568, 196, 28, 4, 1}, + dtype::QuantizedS32{1.2f * 1.3f}}, + }); + } } TEST_F(CUDA, CONV_BIAS_FORWARD_BATCHED_MATMUL) { @@ -939,6 +1005,17 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_BATCHED_MATMUL) { checker.set_param(arg.param); checker.execs({arg.src, arg.filter, arg.bias, {}, {}}); } + //! noncontiguous case + { + param::ConvBias param; + checker.set_param(param).execl(TensorLayoutArray{ + {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()}, + {{16, 16, 1, 1}, {16, 1, 1, 1}, dtype::Float32()}, + {{}, {}, dtype::Float32()}, + {{}, {}, dtype::Float32()}, + {{2, 16, 7, 7}, {784, 49, 7, 1}, dtype::Float32()}, + }); + } } TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) {