From 6cefabe734e26a3be4edd8a5143addd8b21ad10e Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Fri, 11 Feb 2022 17:38:00 +0800
Subject: [PATCH] fix(dnn/cuda): fix ci

GitOrigin-RevId: 8267e5f9ddd5c6813fcfccf8df197f3c8112fa98
---
 .../cuda/conv_bias/cutlass_convolution_base.cpp |  5 ++---
 .../implicit_batched_gemm_float16_nchw_hmma.cpp |  2 ++
 .../implicit_batched_gemm_float32_nchw_fma.cpp  |  3 +++
 .../implicit_batched_gemm_float32_nchw_fma.cpp  |  1 +
 .../implicit_gemm_int8_nchw4_dp4a.cpp           | 15 ++++++++++++++-
 dnn/test/cuda/chanwise_convolution.cpp          |  7 ++++++-
 dnn/test/cuda/conv_bias.cpp                     |  2 +-
 7 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp b/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
index fb89ec6e6..bcd3d6884 100644
--- a/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
+++ b/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
@@ -245,9 +245,8 @@ std::pair<int, int> get_tensor_alignment(
     int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n *
                   algo_param.threadblock_k /
                   (algo_param.warp_m * algo_param.warp_n * algo_param.warp_k);
-    int threadblock_loads = filter.dtype.size(
-            algo_param.threadblock_m * algo_param.threadblock_n *
-            algo_param.threadblock_k);
+    int threadblock_loads =
+            filter.dtype.size(algo_param.threadblock_m * algo_param.threadblock_k);
     int load_per_thread = threadblock_loads / threads;
     if (load_per_thread >= 16)
         alignment_filter = 16;
diff --git a/dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp b/dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
index 6ef571448..48c64c58f 100644
--- a/dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
@@ -30,6 +30,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
     using Format = Param::Format;
     using Sparse = Param::Sparse;
     using Mode = Param::Mode;
+    using NonlineMode = Param::NonlineMode;
     auto&& param = args.opr->param();
     auto&& fm = args.filter_meta;
     RETURN_IF_FALSE(
@@ -37,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
             args.src_layout->dtype.enumv() == DTypeEnum::Float16 &&
             args.filter_layout->dtype.enumv() == DTypeEnum::Float16 &&
             args.dst_layout->dtype.enumv() == DTypeEnum::Float16);
+    RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
     RETURN_IF_FALSE(
             args.bias_layout->ndim <= 0 ||
             (args.bias_layout->dtype.enumv() == DTypeEnum::Float16 &&
diff --git a/dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp b/dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
index 9d5df71da..1e4fee11f 100644
--- a/dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
@@ -23,12 +23,14 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
 #define RETURN_IF_FALSE(stmt_) \
     if (!(stmt_))              \
         return false;
+    RETURN_IF_FALSE(is_compute_capability_required(6, 1));
     RETURN_IF_FALSE(
             args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
     using Param = param::ConvBias;
     using Format = Param::Format;
     using Sparse = Param::Sparse;
     using Mode = Param::Mode;
+    using NonlineMode = Param::NonlineMode;
     auto&& param = args.opr->param();
     auto&& fm = args.filter_meta;
     RETURN_IF_FALSE(
@@ -36,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
             args.src_layout->dtype.enumv() == DTypeEnum::Float32 &&
             args.filter_layout->dtype.enumv() == DTypeEnum::Float32 &&
             args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
+    RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
     RETURN_IF_FALSE(
             args.bias_layout->ndim <= 0 ||
             (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
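NOTE: the cutlass_convolution_base.cpp hunk above narrows the byte count fed
into the filter-alignment heuristic from the whole m x n x k threadblock tile
to the m x k filter tile (threadblock_n spans the output tile, not the
filter). A self-contained sketch of that heuristic follows; the AlgoParam
struct and the tile shape in main() are hypothetical stand-ins, not MegDNN's
real types.

// Standalone sketch of the corrected filter-alignment heuristic.
#include <cassert>
#include <cstdio>

struct AlgoParam {
    int threadblock_m, threadblock_n, threadblock_k;
    int warp_m, warp_n, warp_k;
};

// Pick the widest vector access (16/8/4 bytes) each thread can use when the
// threadblock cooperatively stages its m x k filter tile.
int filter_alignment(const AlgoParam& p, int dtype_bytes) {
    constexpr int warp_size = 32;
    int threads = warp_size * p.threadblock_m * p.threadblock_n *
                  p.threadblock_k / (p.warp_m * p.warp_n * p.warp_k);
    // The filter tile is threadblock_m x threadblock_k elements;
    // threadblock_n must not be counted (that was the bug).
    int threadblock_loads = dtype_bytes * p.threadblock_m * p.threadblock_k;
    int load_per_thread = threadblock_loads / threads;
    assert(load_per_thread >= 4);
    if (load_per_thread >= 16)
        return 16;
    if (load_per_thread >= 8)
        return 8;
    return 4;
}

int main() {
    AlgoParam p{128, 128, 32, 64, 64, 32};  // hypothetical tile shape
    printf("filter alignment = %d bytes\n", filter_alignment(p, 1));
    return 0;
}

With this 128x128x32 tile and 64x64x32 warps there are 128 threads and, for an
int8 filter, 32 bytes per thread, so 16-byte vectorized loads are selected.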
diff --git a/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp b/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp
index 7487f1708..17bc1a0df 100644
--- a/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp
+++ b/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp
@@ -63,6 +63,7 @@ bool ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_avai
 #define RETURN_IF_FALSE(stmt_) \
     if (!(stmt_))              \
         return false;
+    RETURN_IF_FALSE(is_compute_capability_required(6, 1));
     RETURN_IF_FALSE(
             args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous());
     using Param = param::Convolution;
diff --git a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp
index d499556fa..0ef369058 100644
--- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp
+++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp
@@ -29,6 +29,19 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
             (sh == 2 && sw == 2)
                     ? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING
                     : cutlass::conv::SpecialOptimizeDesc::NONE;
+    int alignment_filter = 4;
+    constexpr int warp_size = 32;
+    int threads = warp_size * m_algo_param.threadblock_m * m_algo_param.threadblock_n *
+                  m_algo_param.threadblock_k /
+                  (m_algo_param.warp_m * m_algo_param.warp_n * m_algo_param.warp_k);
+    int threadblock_loads = args.filter_layout->dtype.size(
+            m_algo_param.threadblock_m * m_algo_param.threadblock_k);
+    int load_per_thread = threadblock_loads / threads;
+    if (load_per_thread >= 16)
+        alignment_filter = 16;
+    else if (load_per_thread >= 8)
+        alignment_filter = 8;
+    megdnn_assert(load_per_thread >= 4);
     ConvolutionKey key{
             cutlass::conv::Operator::kDgrad,
             NumericTypeID::kS8,
@@ -54,7 +67,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
             m_algo_param.stage,
             special_optimization,
             4,
-            4,
+            alignment_filter,
             false};
     return (void*)Singleton::get().operation_table.find_op(key);
 }
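NOTE: the dgrad int8 hunk above computes alignment_filter with the same
heuristic instead of hardcoding 4, because the alignment is one of the fields
of the key used to look up the generated CUTLASS kernel. A toy illustration of
that lookup follows; the three-field key and the kernel names are deliberately
simplified, hypothetical stand-ins (the real ConvolutionKey carries many more
fields).

// Sketch: the filter alignment is part of the kernel lookup key, so a
// hardcoded value misses kernels generated with a wider vector width.
#include <cstdio>
#include <map>
#include <tuple>

// (threadblock_m, threadblock_k, alignment_filter) -> kernel name
using ConvolutionKeySketch = std::tuple<int, int, int>;

int main() {
    std::map<ConvolutionKeySketch, const char*> operation_table;
    operation_table[std::make_tuple(64, 32, 16)] = "dgrad_s8_nchw4_64x32_align16";
    operation_table[std::make_tuple(16, 32, 4)] = "dgrad_s8_nchw4_16x32_align4";

    // With alignment hardcoded to 4, a tile whose generated kernel uses
    // 16-byte filter loads is never found -> "no usable algorithm".
    auto miss = operation_table.find(std::make_tuple(64, 32, 4));
    auto hit = operation_table.find(std::make_tuple(64, 32, 16));
    printf("hardcoded 4:  %s\n",
           miss == operation_table.end() ? "not found" : miss->second);
    printf("computed 16:  %s\n",
           hit == operation_table.end() ? "not found" : hit->second);
    return 0;
}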
diff --git a/dnn/test/cuda/chanwise_convolution.cpp b/dnn/test/cuda/chanwise_convolution.cpp
index 2202ffd67..d154a843f 100644
--- a/dnn/test/cuda/chanwise_convolution.cpp
+++ b/dnn/test/cuda/chanwise_convolution.cpp
@@ -20,6 +20,7 @@
 #include "test/common/workspace_wrapper.h"
 #include "test/cuda/benchmark.h"
 #include "test/cuda/fixture.h"
+#include "test/cuda/utils.h"
 
 #include <cuda_profiler_api.h>
 #include <cuda_runtime_api.h>
@@ -510,6 +511,7 @@ void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char*
 
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                         \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) {                 \
+        require_compute_capability(6, 1);                                         \
         check_chanwise(                                                            \
                 dtype::Float32(), dtype::Float32(), handle_cuda(),                 \
                 "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk   \
@@ -522,6 +524,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)
 
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                         \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) {           \
+        require_compute_capability(6, 1);                                         \
         check_chanwise(                                                            \
                 dtype::Float32(), dtype::Float32(), handle_cuda(),                 \
                 "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk   \
@@ -544,6 +547,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)
 // check both ioc16 and io16xc32
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                         \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) {                \
+        require_compute_capability(7, 0);                                         \
         check_chanwise(                                                            \
                 dtype::Float16(), dtype::Float16(), handle_cuda(),                 \
                 "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk  \
@@ -560,6 +564,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)
 
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                         \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) {          \
+        require_compute_capability(7, 0);                                         \
         check_chanwise(                                                            \
                 dtype::Float16(), dtype::Float16(), handle_cuda(),                 \
                 "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk  \
@@ -1407,7 +1412,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) {
         bencher.proxy()->target_execution_policy.algo.reset();
         param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
         bencher.set_param(param);
-        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;
+        auto time_in_ms_pseudo_fp16 = bencher.execs({filter, src, src}) / RUNS;
         printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
                "float16: %.2fms %.2fGB/s "
diff --git a/dnn/test/cuda/conv_bias.cpp b/dnn/test/cuda/conv_bias.cpp
index e2120a18d..f08441e87 100644
--- a/dnn/test/cuda/conv_bias.cpp
+++ b/dnn/test/cuda/conv_bias.cpp
@@ -1033,7 +1033,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) {
                 ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                         "CUDA:GROUP_CONV", {})
                         .c_str(),
-                {{"CUDNN", {}}}}));
+                {{"DEFAULT:CUDNN", {}}}}));
         ConvBias::Param param;
         param.sparse = ConvBias::Param::Sparse::GROUP;
         param.nonlineMode = mode;
-- 
GitLab
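NOTE: the test changes mirror the kernel-side requirements: the CUTLASS
chanwise FMA kernels now require compute capability 6.1 and the HMMA kernels
7.0, so the tests skip on older devices via require_compute_capability. A
rough standalone equivalent of that guard, using only the CUDA runtime API
(the function name and skip message are illustrative, not MegDNN's test
helpers):

// Sketch: skip work on devices below a minimum compute capability.
#include <cuda_runtime_api.h>
#include <cstdio>

bool has_compute_capability(int major, int minor) {
    int dev = 0;
    cudaDeviceProp prop{};
    if (cudaGetDevice(&dev) != cudaSuccess ||
        cudaGetDeviceProperties(&prop, dev) != cudaSuccess)
        return false;
    return prop.major > major || (prop.major == major && prop.minor >= minor);
}

int main() {
    if (!has_compute_capability(6, 1)) {
        printf("skip: FMA implicit batched GEMM kernels need sm_61 or newer\n");
        return 0;
    }
    // ... run the CUTLASS chanwise convolution checks here ...
    return 0;
}

Separately, the benchmark fix in chanwise_convolution.cpp reflects
ConvolutionBackwardData's operand order, (filter, diff, grad), so the
pseudo-fp16 run now passes {filter, src, src} rather than the forward-style
{src, filter, {}}.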