diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp b/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
index fb89ec6e6effd83c668e662b6d6299868f560678..bcd3d6884a26ac5a6bd71f9dc7ae283f5ed1bcd8 100644
--- a/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
+++ b/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
@@ -245,9 +245,8 @@ std::pair get_tensor_alignment(
     int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n *
                   algo_param.threadblock_k /
                   (algo_param.warp_m * algo_param.warp_n * algo_param.warp_k);
-    int threadblock_loads = filter.dtype.size(
-            algo_param.threadblock_m * algo_param.threadblock_n *
-            algo_param.threadblock_k);
+    int threadblock_loads =
+            filter.dtype.size(algo_param.threadblock_m * algo_param.threadblock_k);
     int load_per_thread = threadblock_loads / threads;
     if (load_per_thread >= 16)
         alignment_filter = 16;
diff --git a/dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp b/dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
index 6ef571448bd4bcbec405190a9012c5dc180eafc4..48c64c58fddd9f7afbbc1bea81ca90feebdd8a72 100644
--- a/dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
@@ -30,6 +30,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
     using Format = Param::Format;
     using Sparse = Param::Sparse;
     using Mode = Param::Mode;
+    using NonlineMode = Param::NonlineMode;
     auto&& param = args.opr->param();
     auto&& fm = args.filter_meta;
     RETURN_IF_FALSE(
@@ -37,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
             args.src_layout->dtype.enumv() == DTypeEnum::Float16 &&
             args.filter_layout->dtype.enumv() == DTypeEnum::Float16 &&
             args.dst_layout->dtype.enumv() == DTypeEnum::Float16);
+    RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
     RETURN_IF_FALSE(
             args.bias_layout->ndim <= 0 ||
             (args.bias_layout->dtype.enumv() == DTypeEnum::Float16 &&
diff --git a/dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp b/dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
index 9d5df71dada0c47e38faf669e756e8af2ad7504a..1e4fee11f8a05cf7ef57bb15740af1edb6244059 100644
--- a/dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
@@ -23,12 +23,14 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
 #define RETURN_IF_FALSE(stmt_) \
     if (!(stmt_))              \
         return false;
+    RETURN_IF_FALSE(is_compute_capability_required(6, 1));
     RETURN_IF_FALSE(
             args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
     using Param = param::ConvBias;
     using Format = Param::Format;
     using Sparse = Param::Sparse;
     using Mode = Param::Mode;
+    using NonlineMode = Param::NonlineMode;
     auto&& param = args.opr->param();
     auto&& fm = args.filter_meta;
     RETURN_IF_FALSE(
@@ -36,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
             args.src_layout->dtype.enumv() == DTypeEnum::Float32 &&
             args.filter_layout->dtype.enumv() == DTypeEnum::Float32 &&
             args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
+    RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
     RETURN_IF_FALSE(
             args.bias_layout->ndim <= 0 ||
             (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
diff --git a/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp b/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp
index 7487f1708b70a1088e166a940c406d2ef8c166ba..17bc1a0dfb50cfddb39f11b251c189a7d471c008 100644
--- a/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp
+++ b/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp
@@ -63,6 +63,7 @@ bool ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_avai
 #define RETURN_IF_FALSE(stmt_) \
     if (!(stmt_))              \
         return false;
+    RETURN_IF_FALSE(is_compute_capability_required(6, 1));
     RETURN_IF_FALSE(
             args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous());
     using Param = param::Convolution;
diff --git a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp
index d499556fae463cb4d3ea558a9850c34a9ac9441c..0ef36905807f060c011c59b4b9d58ae554265311 100644
--- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp
+++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp
@@ -29,6 +29,19 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
             (sh == 2 && sw == 2)
                     ? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING
                     : cutlass::conv::SpecialOptimizeDesc::NONE;
+    int alignment_filter = 4;
+    constexpr int warp_size = 32;
+    int threads = warp_size * m_algo_param.threadblock_m * m_algo_param.threadblock_n *
+                  m_algo_param.threadblock_k /
+                  (m_algo_param.warp_m * m_algo_param.warp_n * m_algo_param.warp_k);
+    int threadblock_loads = args.filter_layout->dtype.size(
+            m_algo_param.threadblock_m * m_algo_param.threadblock_k);
+    int load_per_thread = threadblock_loads / threads;
+    if (load_per_thread >= 16)
+        alignment_filter = 16;
+    else if (load_per_thread >= 8)
+        alignment_filter = 8;
+    megdnn_assert(load_per_thread >= 4);
     ConvolutionKey key{
             cutlass::conv::Operator::kDgrad,
             NumericTypeID::kS8,
@@ -54,7 +67,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
             m_algo_param.stage,
             special_optimization,
             4,
-            4,
+            alignment_filter,
             false};
     return (void*)Singleton::get().operation_table.find_op(key);
 }
diff --git a/dnn/test/cuda/chanwise_convolution.cpp b/dnn/test/cuda/chanwise_convolution.cpp
index 2202ffd67529b8055fddf3b7f564a6f86749a8ba..d154a843fe4d80fa136c7003adaee07b7fe0ade7 100644
--- a/dnn/test/cuda/chanwise_convolution.cpp
+++ b/dnn/test/cuda/chanwise_convolution.cpp
@@ -20,6 +20,7 @@
 #include "test/common/workspace_wrapper.h"
 #include "test/cuda/benchmark.h"
 #include "test/cuda/fixture.h"
+#include "test/cuda/utils.h"
 
 #include 
 #include 
@@ -510,6 +511,7 @@ void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char*
 
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                         \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) {                 \
+        require_compute_capability(6, 1);                                          \
         check_chanwise(                                                            \
                 dtype::Float32(), dtype::Float32(), handle_cuda(),                 \
                 "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk   \
@@ -522,6 +524,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)
 
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                         \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) {           \
+        require_compute_capability(6, 1);                                          \
         check_chanwise(                                                            \
                 dtype::Float32(), dtype::Float32(), handle_cuda(),                 \
                 "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk   \
@@ -544,6 +547,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)
 // check both ioc16 and io16xc32
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                         \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) {                 \
+        require_compute_capability(7, 0);                                          \
         check_chanwise(                                                            \
                 dtype::Float16(), dtype::Float16(), handle_cuda(),                 \
                 "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk  \
@@ -560,6 +564,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)
 
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                         \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) {          \
+        require_compute_capability(7, 0);                                          \
         check_chanwise(                                                            \
                 dtype::Float16(), dtype::Float16(), handle_cuda(),                 \
                 "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk  \
@@ -1407,7 +1412,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) {
     bencher.proxy()->target_execution_policy.algo.reset();
     param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
     bencher.set_param(param);
-    auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;
+    auto time_in_ms_pseudo_fp16 = bencher.execs({filter, src, src}) / RUNS;
 
     printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
            "float16: %.2fms %.2fGB/s "
diff --git a/dnn/test/cuda/conv_bias.cpp b/dnn/test/cuda/conv_bias.cpp
index e2120a18df16af1f0d614da463bd16e44bbd52bd..f08441e87ef0c3ca78cc9c64afa4d3bd956d31f9 100644
--- a/dnn/test/cuda/conv_bias.cpp
+++ b/dnn/test/cuda/conv_bias.cpp
@@ -1033,7 +1033,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) {
             ConvBiasForward::algo_name(
                     "CUDA:GROUP_CONV", {})
                     .c_str(),
-            {{"CUDNN", {}}}}));
+            {{"DEFAULT:CUDNN", {}}}}));
     ConvBias::Param param;
     param.sparse = ConvBias::Param::Sparse::GROUP;
     param.nonlineMode = mode;