diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp index e100417c75c14fe31a2d4a2c2037c8399c41330b..014bbde9087dd04cc54b758a172b0ddcf510aaa0 100644 --- a/dnn/src/cuda/conv_bias/algo.cpp +++ b/dnn/src/cuda/conv_bias/algo.cpp @@ -260,16 +260,17 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() { using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam; - int8_nchw4_dotprod.emplace_back(AlgoParam{128, 128, 32, 64, 32, 32}); - int8_nchw4_dotprod.emplace_back(AlgoParam{128, 64, 32, 64, 32, 32}); - int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32}); - int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32}); - int8_nchw4_dotprod.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32}); - int8_nchw4_dotprod.emplace_back(AlgoParam{64, 64, 32, 64, 32, 32}); - int8_nchw4_dotprod.emplace_back(AlgoParam{32, 64, 32, 32, 64, 32}); - int8_nchw4_dotprod.emplace_back(AlgoParam{64, 32, 32, 64, 32, 32}); - int8_nchw4_dotprod.emplace_back(AlgoParam{32, 32, 32, 32, 32, 32}); - int8_nchw4_dotprod.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); + int8_nchw4_dotprod.emplace_back(AlgoParam{128, 128, 32, 64, 32, 32, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{128, 64, 32, 64, 32, 32, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{64, 64, 32, 64, 32, 32, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{32, 64, 32, 32, 64, 32, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{64, 32, 32, 64, 32, 32, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{32, 32, 32, 32, 32, 32, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1}); + int8_nchw4_dotprod.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2}); } diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h index 8325548f4f32d899e519640a3eeea5a613147cdd..a95a96d46580d442898e589a42c887e5d066c8c9 100644 --- a/dnn/src/cuda/conv_bias/algo.h +++ b/dnn/src/cuda/conv_bias/algo.h @@ -407,15 +407,16 @@ public: int warp_m; int warp_n; int warp_k; + int stage; std::string to_string() { /// default algorithm if (threadblock_m == 128 && threadblock_n == 128 && threadblock_k == 32 && warp_m == 32 && warp_n == 64 && - warp_k == 32) { + warp_k == 32 && stage == 2) { return ""; } - return ssprintf("_%dX%dX%d_%dX%dX%d", threadblock_m, threadblock_n, - threadblock_k, warp_m, warp_n, warp_k); + return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n, + threadblock_k, warp_m, warp_n, warp_k, stage); } }; AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param) diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu index 832e12281008e045a75a22d2e198aed6872f56ab..fd840927240eeadba049eb1fceaeadef9291deb3 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu @@ -172,7 +172,7 @@ void megdnn::cuda::cutlass_wrapper:: const GemmCoord& warp_shape, cudaStream_t stream) { #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ threadblock_k_, warp_m_, warp_n_, \ - warp_k_, aligned_) \ + warp_k_, stage_, aligned_) \ if (threadblock_shape.m() == threadblock_m_ && \ threadblock_shape.n() == threadblock_n_ && \ threadblock_shape.k() == threadblock_k_ && \ @@ -194,7 +194,7 @@ void megdnn::cuda::cutlass_wrapper:: cutlass::convolution::threadblock:: \ ConvolutionNCxHWxThreadblockSwizzle< \ cutlass::convolution::ConvType::kConvolution>, \ - 2, 4, aligned_, NeedLoadFromConstMem>; \ + stage_, 4, aligned_, NeedLoadFromConstMem>; \ typename Convolution::ConvolutionParameter conv_param{ \ param.n, param.ci, param.co, param.hi, param.wi, \ param.fh, param.fw, param.ho, param.wo, param.sh, \ @@ -204,16 +204,17 @@ void megdnn::cuda::cutlass_wrapper:: epilogue, stream); \ } #define DISPATCH_KERNEL \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 4); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 128, 16, 16, 128, 16, 1, 8); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 2, 4); \ megdnn_assert(false, \ "unsupported threadblock shape (%dx%dx%d) and warp shape " \ "(%dx%dx%d)", \ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..ab01f989dadf003d7073fb228910bdc6050460b8 Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu differ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..9f901437080cba0a608bbc613716d45356c1c870 Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu differ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..5dfd371c8bdcd07754f633083d69edd5c36850ef Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu differ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..1c7115e9d9b4e077f0b265463b4c45710751ec6f Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu differ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..374f51e9660a374dd54912933f94785123c5e458 Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu differ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..76ecaad0dd28cb71a79c83b7470737e24741710e Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu differ diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp index 3173be121ff909e0032bc31b4fbe714fe5ca7f7f..f6d588f50770495b5ca4698b7491ace93a1ce9f9 100644 --- a/dnn/test/cuda/conv_bias_int8.cpp +++ b/dnn/test/cuda/conv_bias_int8.cpp @@ -97,6 +97,13 @@ std::vector get_detection_bench_args(size_t batch = 16) { return args; } +std::vector get_det_first_bench_args(size_t batch = 16) { + std::vector args; + args.emplace_back(BenchArgs{batch, 4, 736, 1280, 16, 3, 2}); + args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1}); + return args; +} + void benchmark_target_algo( Handle* handle, const std::vector& args, DType src_dtype, DType filter_dtype, DType bias_dtype, DType dst_dtype, @@ -1236,6 +1243,28 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) { dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4); } + +TEST_F(CUDA, BENCHMARK_SASS_CONV_BIAS_INT8_NCHW4_DET_FIRST) { + require_compute_capability(6, 1); + std::string algo = ConvBias::algo_name( + "SASS_INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32_64", + ConvBias::DirectParam{}); + benchmark_target_algo(handle_cuda(), get_det_first_bench_args(16), + dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f}, + dtype::QuantizedS32{1.2f * 1.3f}, + dtype::QuantizedS8{1.0f}, algo.c_str(), + param::ConvBias::Format::NCHW4); +} + +TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4_DET_FIRST) { + require_compute_capability(6, 1); + benchmark_target_algo( + handle_cuda(), get_det_first_bench_args(16), + dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f}, + dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, + "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16", param::ConvBias::Format::NCHW4); +} + #endif } // namespace test } // namespace megdnn diff --git a/third_party/cutlass b/third_party/cutlass index 5a7f4bfa0e57f92140c8236322a86730132e0847..41426ea4074dcfc448b1c9979ea7617407590c04 160000 --- a/third_party/cutlass +++ b/third_party/cutlass @@ -1 +1 @@ -Subproject commit 5a7f4bfa0e57f92140c8236322a86730132e0847 +Subproject commit 41426ea4074dcfc448b1c9979ea7617407590c04