diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h
index a95a96d46580d442898e589a42c887e5d066c8c9..21b3c6e4ef573c2550ee60c7720b6897a18464ac 100644
--- a/dnn/src/cuda/conv_bias/algo.h
+++ b/dnn/src/cuda/conv_bias/algo.h
@@ -428,6 +428,11 @@ public:
     void exec(const ExecArgs& args) const override;
     const char* name() const override { return m_name.c_str(); }
     bool is_reproducible() const override { return true; }
+    size_t get_preprocess_workspace_in_bytes(
+            const SizeArgs& args) const override;
+    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
+            const SizeArgs& args) const override;
+    void exec_preprocess(const ExecArgs& args) const override;
 
 private:
     WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
@@ -560,6 +565,11 @@ public:
     const char* name() const override { return m_name.c_str(); }
     bool is_reproducible() const override { return true; }
     static std::string to_string(AlgoParam algo_param);
+    size_t get_preprocess_workspace_in_bytes(
+            const SizeArgs& args) const override;
+    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
+            const SizeArgs& args) const override;
+    void exec_preprocess(const ExecArgs& args) const override;
 
 private:
     WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp
index 7f42126cada2850584125cc48dcb9209c6385e7d..80fd2d35f0ac915a21a837bb2e1462441f29b1f4 100644
--- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp
@@ -65,8 +65,12 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available(
 WorkspaceBundle
 ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::get_workspace_bundle(
         dt_byte* raw_ptr, const SizeArgs& args) const {
-    size_t ws_filter = args.filter_layout->span().dist_byte();
-    return WorkspaceBundle{raw_ptr, {ws_filter}};
+    if (args.preprocessed_filter) {
+        return WorkspaceBundle{raw_ptr, {}};
+    } else {
+        size_t ws_filter = args.filter_layout->span().dist_byte();
+        return WorkspaceBundle{raw_ptr, {ws_filter}};
+    }
 }
 
 size_t
@@ -82,12 +86,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
     auto&& fm = args.filter_meta;
     UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout),
                                   param);
-    auto ws = get_workspace_bundle(args.workspace.raw_ptr, args);
-    auto ws_filter = ws.get(0);
     auto&& stream = cuda_stream(args.opr->handle());
 
-    // reformat filter from nchw32 to chwn32
-    {
+    int8_t* filter_ptr = nullptr;
+    if (args.preprocessed_filter == nullptr) {
+        filter_ptr = reinterpret_cast<int8_t*>(args.workspace.raw_ptr);
+        // reformat filter from nchw32 to chwn32
         TensorLayout src{{co, ci / 32, fh, fw, 32}, dtype::Int8()};
         src.init_contiguous_stride();
         TensorLayout dst = src;
@@ -99,11 +103,14 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
         TensorND ts_src, ts_dst;
         ts_src.raw_ptr = args.filter_tensor->raw_ptr;
         ts_src.layout = src;
-        ts_dst.raw_ptr = ws_filter;
+        ts_dst.raw_ptr = args.workspace.raw_ptr;
         ts_dst.layout = dst;
         auto&& transpose =
                 args.opr->handle()->create_operator<RelayoutForward>();
         transpose->exec(ts_src, ts_dst);
+    } else {
+        filter_ptr = reinterpret_cast<int8_t*>(
+                args.preprocessed_filter->tensors[0].raw_ptr);
     }
 
     ConvParam kern_param;
@@ -131,8 +138,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
     uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
     if (fh == 1 && fw == 1) {
         cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32<
-                false>(args.src_tensor->compatible_ptr<int8_t>(),
-                       reinterpret_cast<int8_t*>(ws_filter),
+                false>(args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
                        args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
                        args.dst_tensor->compatible_ptr<int8_t>(), nullptr,
                        kern_param, nonlinear_mode, alpha, beta, gamma,
@@ -146,8 +152,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
                        stream);
     } else {
         cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32<true>(
-                args.src_tensor->compatible_ptr<int8_t>(),
-                reinterpret_cast<int8_t*>(ws_filter),
+                args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
                 args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
                 args.dst_tensor->compatible_ptr<int8_t>(), nullptr,
                 kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale,
@@ -167,6 +172,41 @@ std::string ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::to_string(
             algo_param.threadblock_n, algo_param.threadblock_k,
             algo_param.warp_m, algo_param.warp_n, algo_param.warp_k);
 }
+
+size_t ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::
+        get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
+    return 0_z;
+}
+
+SmallVector<TensorLayout> ConvBiasForwardImpl::
+        AlgoInt8NCHW32IMMAImplicitGemm::deduce_preprocessed_filter_layout(
+                const SizeArgs& args) const {
+    return {args.filter_layout->collapse_contiguous()};
+}
+
+void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec_preprocess(
+        const ExecArgs& args) const {
+    using Format = Param::Format;
+    auto&& param = args.opr->param();
+    auto&& fm = args.filter_meta;
+    UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout),
+                                  param);
+    TensorLayout src{{co, ci / 32, fh, fw, 32}, dtype::Int8()};
+    src.init_contiguous_stride();
+    TensorLayout dst = src;
+    dst.stride[0] = 32;
+    dst.stride[1] = co * fh * fw * 32;
+    dst.stride[2] = co * fw * 32;
+    dst.stride[3] = co * 32;
+    dst.stride[4] = 1;
+    TensorND ts_src, ts_dst;
+    ts_src.raw_ptr = args.filter_tensor->raw_ptr;
+    ts_src.layout = src;
+    ts_dst.raw_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
+    ts_dst.layout = dst;
+    auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
+    transpose->exec(ts_src, ts_dst);
+}
 #endif
 
 // vim: syntax=cpp.doxygen
diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
index cf656801450c36840cbd77bee4f9c1e10b4fe94a..22451bf301a42c4775a63783c6de0da83f95f298 100644
--- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
@@ -62,8 +62,12 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
 WorkspaceBundle
 ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_workspace_bundle(
         dt_byte* raw_ptr, const SizeArgs& args) const {
-    size_t ws_filter = args.filter_layout->span().dist_byte();
-    return WorkspaceBundle{raw_ptr, {ws_filter}};
+    if (args.preprocessed_filter) {
+        return WorkspaceBundle{raw_ptr, {}};
+    } else {
+        size_t ws_filter = args.filter_layout->span().dist_byte();
+        return WorkspaceBundle{raw_ptr, {ws_filter}};
+    }
 }
 
 size_t
@@ -79,12 +83,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
     auto&& fm = args.filter_meta;
     UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout),
                                  param);
-    auto ws = get_workspace_bundle(args.workspace.raw_ptr, args);
-    auto ws_filter = ws.get(0);
     auto&& stream = cuda_stream(args.opr->handle());
 
-    // reformat filter from nchw4 to chwn4
-    {
+    int8_t* filter_ptr = nullptr;
+    if (args.preprocessed_filter == nullptr) {
+        filter_ptr = reinterpret_cast<int8_t*>(args.workspace.raw_ptr);
+        // reformat filter from nchw4 to chwn4
         TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()};
         src.init_contiguous_stride();
         TensorLayout dst = src;
@@ -92,11 +96,14 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
         TensorND ts_src, ts_dst;
         ts_src.raw_ptr = args.filter_tensor->raw_ptr;
         ts_src.layout = src;
-        ts_dst.raw_ptr = ws_filter;
+        ts_dst.raw_ptr = args.workspace.raw_ptr;
         ts_dst.layout = dst;
         auto&& transpose =
                 args.opr->handle()->create_operator<RelayoutForward>();
         transpose->exec(ts_src, ts_dst);
+    } else {
+        filter_ptr = reinterpret_cast<int8_t*>(
+                args.preprocessed_filter->tensors[0].raw_ptr);
     }
 
     convolution::ConvParam kern_param;
@@ -124,8 +131,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
     uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
     if (fh == 1 && fw == 1) {
         cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<false>(
-                args.src_tensor->compatible_ptr<int8_t>(),
-                reinterpret_cast<int8_t*>(ws_filter),
+                args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
                 args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
                 args.dst_tensor->compatible_ptr<int8_t>(), nullptr,
                 kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale,
@@ -138,8 +144,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                 stream);
     } else {
         cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<true>(
-                args.src_tensor->compatible_ptr<int8_t>(),
-                reinterpret_cast<int8_t*>(ws_filter),
+                args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
                 args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
                 args.dst_tensor->compatible_ptr<int8_t>(), nullptr,
                 kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale,
@@ -153,4 +158,35 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                 stream);
     }
 }
+size_t ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::
+        get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
+    return 0_z;
+}
+
+SmallVector<TensorLayout> ConvBiasForwardImpl::
+        AlgoInt8NCHW4DotProdImplicitGemm::deduce_preprocessed_filter_layout(
+                const SizeArgs& args) const {
+    return {args.filter_layout->collapse_contiguous()};
+}
+
+void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec_preprocess(
+        const ExecArgs& args) const {
+    using Format = Param::Format;
+    auto&& param = args.opr->param();
+    auto&& fm = args.filter_meta;
+    UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout),
+                                 param);
+    TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()};
+    src.init_contiguous_stride();
+    TensorLayout dst = src;
+    dst.stride[0] = 1, dst.stride[1] = dst[0];
+    TensorND ts_src, ts_dst;
+    ts_src.raw_ptr = args.filter_tensor->raw_ptr;
+    ts_src.layout = src;
+    ts_dst.raw_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
+    ts_dst.layout = dst;
+    auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
+    transpose->exec(ts_src, ts_dst);
+}
+
 // vim: syntax=cpp.doxygen
diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp
index f6d588f50770495b5ca4698b7491ace93a1ce9f9..5a3560c77e1bc085958a80f4b637a7d50397abea 100644
--- a/dnn/test/cuda/conv_bias_int8.cpp
+++ b/dnn/test/cuda/conv_bias_int8.cpp
@@ -1084,6 +1084,42 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
 }
 
+TEST_F(CUDA, CUTLASS_WEIGHT_PREPROCESS) {
+    require_compute_capability(6, 1);
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
+            handle_cuda());
+    auto check = [&checker](const std::string& algo) {
+        checker.set_before_exec_callback(
+                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
+        UniformIntRNG rng{-16, 16};
+        UniformIntRNG bias_rng{-50, 50};
+        UniformIntRNG const_rng{1, 1};
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_rng(2, &bias_rng)
+                .set_rng(3, &rng)
+                .set_dtype(0, dtype::QuantizedS8{1.2f})
+                .set_dtype(1, dtype::QuantizedS8{1.3f})
+                .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
+                .set_dtype(3, dtype::QuantizedS8{1.3f})
+                .set_dtype(4, dtype::QuantizedS8{1.0f})
+                .set_epsilon(1 + 1e-3)
+                .set_max_avg_error(1e-1)
+                .set_max_avg_biased_error(1e-3);
+        param::ConvBias param;
+        param.pad_h = param.pad_w = 1;
+        param.stride_h = param.stride_w = 2;
+        param.format = param::ConvBias::Format::NCHW4;
+        checker.set_param(param).execs({{16, 4, 14, 14, 4},
+                                        {16, 4, 3, 3, 4},
+                                        {1, 4, 1, 1, 4},
+                                        {},
+                                        {}});
+    };
+    check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32X32_64X32X32");
+    check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8");
+}
+
 #if CUDA_VERSION >= 10020
 /// \note: we only check several cases and block sizes in megdnn_test, the
 /// full testcases are written in cutlass repository
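
The chwn4 reformat that both exec and exec_preprocess perform in the patch is just a stride-permuted transpose: the int8 NCHW4 filter is viewed as an int32 matrix of shape (co, ci/4*fh*fw), and writing it with dst.stride[0] = 1, dst.stride[1] = dst[0] (i.e. strides {1, co}) lays it out channel-major. Below is a minimal standalone sketch of that reordering in plain C++; the function name and sizes are hypothetical illustrations, not part of megdnn or this patch.

    #include <cstdint>
    #include <vector>

    // Standalone illustration (not part of the patch): reorder a row-major
    // (co, chw) int32 matrix into strides {1, co}, which is exactly the
    // dst layout exec_preprocess builds before running the relayout opr.
    std::vector<int32_t> nchw4_to_chwn4(const std::vector<int32_t>& src,
                                        size_t co, size_t chw) {
        std::vector<int32_t> dst(src.size());
        for (size_t o = 0; o < co; ++o)
            for (size_t i = 0; i < chw; ++i)
                // dst element (o, i) lives at offset o * 1 + i * co
                dst[i * co + o] = src[o * chw + i];
        return dst;
    }

Doing this once in exec_preprocess is what lets exec skip the per-call transpose whenever args.preprocessed_filter is non-null.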