/** * \file dnn/src/fallback/conv_bias/algos.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ #include "src/fallback/conv_bias/algos.h" #include "src/common/opr_delegate.h" #include "src/fallback/conv_bias/winograd/strategy.h" #include "src/naive/convolution/helper.h" #include "midout.h" using namespace megdnn; using namespace fallback; namespace { /* Builds a param::Convolution from a ConvBias param by copying mode, padding, stride, dilation and format; the sparse field is always set to DENSE. */ param::Convolution get_param_convolution(const param::ConvBias param) { param::Convolution ret{param.mode, param.pad_h, param.pad_w, param.stride_h, param.stride_w, param.dilate_h, param.dilate_w, param::Convolution::Sparse::DENSE, param.format}; return ret; } /* Returns {src, filter, bias, dst} layouts with a leading dimension of 1; asserts that the format is NCHW. The bias layout is {1, OC, 1, 1} for BROADCAST_CHANNEL_BIAS, {1, OC, OH, OW} for BIAS, and otherwise keeps an empty shape carrying only p.bias_type. */ TensorLayoutArray get_layouts(const param::ConvBias& param, const ConvBiasImpl::NCBKernSizeParam& p) { megdnn_assert(param.format == param::ConvBias::Format::NCHW); UNPACK_CONV_NCB_KERN_SIZES(p); MEGDNN_MARK_USED_VAR(SH); MEGDNN_MARK_USED_VAR(SW); MEGDNN_MARK_USED_VAR(PH); MEGDNN_MARK_USED_VAR(PW); MEGDNN_MARK_USED_VAR(OW); MEGDNN_MARK_USED_VAR(OH); MEGDNN_MARK_USED_VAR(N); TensorLayout src_layout({1, IC, IH, IW}, p.src_type); TensorLayout filter_layout({OC, IC, FH, FW}, p.filter_type); TensorLayout bias_layout{{}, p.bias_type}; if (p.bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) { bias_layout = TensorLayout({1, OC, 1, 1}, p.bias_type); } else if (p.bias_mode == BiasMode::BIAS) { bias_layout = TensorLayout({1, OC, OH, OW}, p.bias_type); } TensorLayout dst_layout = TensorLayout({1, OC, OH, OW}, p.dst_type); return {src_layout, filter_layout, bias_layout, dst_layout}; } /* Runs the naive ConvBias on the src/filter/bias/dst pointers held in p: wraps them in TensorND objects using get_layouts(), dispatches on the dtypes and compute mode to naive::convolution::forward_bias (megdnn_throw when no combination matches), then applies param.nonlineMode so that the final result ends up in dst. */ void kern_default(param::ConvBias param, const ConvBiasImpl::NCBKernParam& p) { dt_byte* workspace_ptr = static_cast(p.workspace_ptr); auto filter_meta_ptr = reinterpret_cast( &p.filter_meta); 
auto filter_meta = *filter_meta_ptr; auto layouts = get_layouts(param, p); TensorND src{reinterpret_cast(const_cast(p.src_ptr)), layouts[0]}; TensorND filter{const_cast(p.filter_ptr), layouts[1]}; auto bias_ptr = reinterpret_cast(const_cast(p.bias_ptr)); TensorND bias{bias_ptr, layouts[2]}; TensorND dst{reinterpret_cast(const_cast(p.dst_ptr)), layouts[3]}; /* sfb receives the conv+bias result; it is dst itself unless the bias dtype differs from the dst dtype, in which case it is placed in the workspace with the bias dtype. */ auto sfb = dst; if (bias.layout.dtype.enumv() != dst.layout.dtype.enumv()) { // intermediate result sfb = TensorND{workspace_ptr, TensorLayout{dst.layout, bias.layout.dtype}}; } #define DISPATCH_RAW(in_dt, bias_dt, out_dt, cmode, func) \ else if (src.layout.dtype.enumv() == DTypeTrait::enumv && \ filter.layout.dtype.enumv() == DTypeTrait::enumv && \ (!bias.layout.dtype.valid() || \ bias.layout.dtype.enumv() == \ DTypeTrait::enumv) && \ sfb.layout.dtype.enumv() == DTypeTrait::enumv && \ param.compute_mode == param::ConvBias::ComputeMode::cmode) { \ func(src, filter, bias, sfb, workspace_ptr, filter_meta); \ } #define DISPATCH(in_dt, out_dt) \ DISPATCH_RAW(in_dt, out_dt, out_dt, DEFAULT, \ (megdnn::naive::convolution::forward_bias< \ DTypeTrait::ctype, \ DTypeTrait::ctype, \ DTypeTrait::ctype, \ DTypeTrait::ctype>)) if (0) { } DISPATCH(Float32, Float32) DISPATCH(Int8, Int16) DISPATCH(Int8, Int32) DISPATCH(QuantizedS8, QuantizedS32) DISPATCH(Quantized8Asymm, QuantizedS32) #if !MEGDNN_DISABLE_FLOAT16 DISPATCH(Float16, Float16) DISPATCH_RAW( Float16, Float16, Float16, FLOAT32, (megdnn::naive::convolution::forward_bias)) #endif else { megdnn_throw( ssprintf("unsupported naive ConvBias(%s, %s, %s) -> %s", src.layout.dtype.name(), filter.layout.dtype.name(), bias.layout.dtype.name(), dst.layout.dtype.name())); } #undef DISPATCH #undef DISPATCH_RAW auto res = sfb; using NonlineMode = param::ConvBias::NonlineMode; switch (param.nonlineMode) { #define cb(_mode) \ case NonlineMode::_mode: { \ if (res.layout.dtype.category() != DTypeCategory::QUANTIZED) { \ auto nonlinear = \ inplace_cpu_handle()->create_operator(); \ 
nonlinear->param().mode = Elemwise::Param::Mode::_mode; \ nonlinear->exec({res}, dst); \ } else { \ auto nonlinear = inplace_cpu_handle() \ ->create_operator(); \ nonlinear->param().mode = \ ElemwiseMultiType::Param::Mode::Q##_mode; \ nonlinear->exec({res}, dst); \ } \ break; \ } cb(RELU); cb(H_SWISH); #undef cb /* SIGMOID asserts that the result is not quantized, runs in place on res, and then an operator is executed from res to dst when they are different buffers. */ case NonlineMode::SIGMOID: { megdnn_assert(res.layout.dtype.category() != DTypeCategory::QUANTIZED); auto nonlinear = inplace_cpu_handle()->create_operator(); nonlinear->param().mode = Elemwise::Param::Mode::SIGMOID; nonlinear->exec({res}, res); if (res.raw_ptr != dst.raw_ptr) { inplace_cpu_handle()->create_operator()->exec(res, dst); } break; } /* IDENTITY only needs the res -> dst step, and only when res is not already dst. */ case NonlineMode::IDENTITY: { if (res.raw_ptr != dst.raw_ptr) { inplace_cpu_handle()->create_operator()->exec(res, dst); } break; } default: megdnn_assert(false); } } } // namespace MIDOUT_DECL(megdnn_fallback_naive) /* ======================= AlgoNaive ======================== */ /* usable(): true exactly when the operator format is NCHW; the size param and the selection strategy are not consulted. */ bool ConvBiasImpl::AlgoNaive::usable( ConvBiasImpl* opr, const NCBKernSizeParam&, AlgoSelectionStrategy /*algo_selection_strategy*/) const { MIDOUT_BEGIN(megdnn_fallback_naive, 0) { return opr->param().format == param::ConvBias::Format::NCHW; } MIDOUT_END(); return false; } /* get_workspace(): non-zero only when p.dst_type is QuantizedS8 or Quantized8Asymm; then it is the WorkspaceBundle size for the byte span of the deduced convolution dst layout, multiplied by p.nr_threads. Returns 0 otherwise. */ size_t ConvBiasImpl::AlgoNaive::get_workspace(ConvBiasImpl* opr, const NCBKernSizeParam& p) const { MIDOUT_BEGIN(megdnn_fallback_naive, 1) { auto layouts = get_layouts(opr->param(), p); //! 
When group>1 or n>1, this algo will parallel by group and n size_t nr_threads = p.nr_threads; auto conv_opr = inplace_cpu_handle()->create_operator(); conv_opr->param() = get_param_convolution(opr->param()); if (p.dst_type.enumv() == DTypeEnum::QuantizedS8 || p.dst_type.enumv() == DTypeEnum::Quantized8Asymm) { TensorLayout conv_dst_layout; conv_opr->deduce_layout(layouts[0], layouts[1], conv_dst_layout); WorkspaceBundle bundle(nullptr, {conv_dst_layout.span().dist_byte()}); return bundle.total_size_in_bytes() * nr_threads; } return 0; } MIDOUT_END(); return 0; } /* dispatch_kerns(): returns one kernel with ndrange {GROUP, N, 1}. Each invocation reads group_id and batch_id from ndrange_id[0] and ndrange_id[1], offsets the workspace pointer by thread_id * (workspace_size / nr_threads), selects the filter/dst/src/bias pointers for that (batch_id, group_id) and calls kern_default. */ SmallVector ConvBiasImpl::AlgoNaive::dispatch_kerns( ConvBiasImpl* opr, const NCBKernSizeParam& p) const { param::ConvBias opr_param = opr->param(); size_t workspace_size = get_workspace(opr, p); //! When group>1 or n>1, this algo will parallel by group and n size_t nr_threads = p.nr_threads; size_t GROUP = p.filter_meta.group; size_t N = p.n; size_t workspace_per_thread = workspace_size / nr_threads; auto kern = [opr_param, workspace_per_thread]( const NCBKernParam& param, const NCBKernIndex& ncb_index) { MIDOUT_BEGIN(megdnn_fallback_naive, 2) { size_t group_id = ncb_index.ndrange_id[0]; size_t batch_id = ncb_index.ndrange_id[1]; size_t thread_id = ncb_index.thread_id; auto thread_param = param; thread_param.workspace_ptr = reinterpret_cast( reinterpret_cast(param.workspace_ptr) + thread_id * workspace_per_thread); thread_param.filter_ptr = param.filter(group_id); thread_param.dst_ptr = param.dst(batch_id, group_id); thread_param.src_ptr = param.src(batch_id, group_id); thread_param.bias_ptr = param.bias(batch_id, group_id); kern_default(opr_param, thread_param); } MIDOUT_END(); }; return {{kern, {GROUP, N, 1_z}}}; } MIDOUT_DECL(megdnn_fallback_winograd) /* ======================= AlgoWinogradF32 ======================== */ /* usable(): requires that m_matmul_algo accepts the winograd matmul param, that the format is NCHW (or NCHW_WINOGRAD with output_block_size 2 and DEFAULT winograd matmul format), CROSS_CORRELATION mode, a 3x3 filter, stride 1, dilation 1, DEFAULT compute mode and a Float32 src dtype. */ bool ConvBiasImpl::AlgoWinogradF32::usable( ConvBiasImpl* opr, const NCBKernSizeParam& param, AlgoSelectionStrategy /*algo_selection_strategy*/) const { MEGDNN_MARK_USED_VAR(param); 
MEGDNN_MARK_USED_VAR(opr); MIDOUT_BEGIN(megdnn_fallback_winograd, 1, 0) { using Strategy = fallback::winograd::winograd_2x3_1x1_f; Strategy strategy(param.src_type, param.filter_type, param.dst_type); auto&& matmul_param = megdnn::winograd::ConvBias( strategy, UNIT_TILE_SIZE, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || (opr->param().format == param::ConvBias::Format::NCHW_WINOGRAD && opr->param().output_block_size == 2 && param.winograd_matmul_format == param::MatrixMul::Format::DEFAULT)) && opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION && (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && param.filter_meta.spatial[0] == 3) && (param.filter_meta.stride[0] == param.filter_meta.stride[1] && param.filter_meta.stride[0] == 1) && (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] && param.filter_meta.dilation[0] == 1) && param.compute_mode == param::ConvBias::ComputeMode::DEFAULT && param.src_type.enumv() == DTypeEnum::Float32; } MIDOUT_END(); return false; } /* get_workspace(): returns the size reported by megdnn::winograd::ConvBias::get_workspace_size for the winograd_2x3_1x1_f strategy with UNIT_TILE_SIZE and m_matmul_algo. */ size_t ConvBiasImpl::AlgoWinogradF32::get_workspace( ConvBiasImpl*, const NCBKernSizeParam& p) const { MEGDNN_MARK_USED_VAR(p); MIDOUT_BEGIN(megdnn_fallback_winograd, 1, 1) { fallback::winograd::winograd_2x3_1x1_f strategy( p.src_type, p.filter_type, p.dst_type); return megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_1x1_f>( strategy, UNIT_TILE_SIZE, p) .get_workspace_size(p, m_matmul_algo); } MIDOUT_END(); return 0; } /* dispatch_kerns(): builds the same winograd ConvBias helper and returns its get_kerns(param, m_matmul_algo); the trailing return yields an empty list. */ SmallVector ConvBiasImpl::AlgoWinogradF32::dispatch_kerns( ConvBiasImpl*, const NCBKernSizeParam& param) const { MEGDNN_MARK_USED_VAR(param); MIDOUT_BEGIN(megdnn_fallback_winograd, 1, 2) { fallback::winograd::winograd_2x3_1x1_f strategy( param.src_type, param.filter_type, param.dst_type); auto winograd_impl = megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_1x1_f>(strategy, UNIT_TILE_SIZE, param); return winograd_impl.get_kerns(param, 
m_matmul_algo); } MIDOUT_END(); return {}; } /* ======================= AlgoWinogradF32 4x4 ======================== */ /* usable(): same conditions as the checks written out below, with two differences visible here: it returns false early unless both icpg and ocpg are multiples of 4, and the NCHW_WINOGRAD branch requires the MK4 winograd matmul format. Uses the winograd_2x3_4x4_f strategy. */ bool ConvBiasImpl::AlgoWinogradF32_4x4::usable( ConvBiasImpl* opr, const NCBKernSizeParam& param, AlgoSelectionStrategy /*algo_selection_strategy*/) const { MEGDNN_MARK_USED_VAR(param); MEGDNN_MARK_USED_VAR(opr); MIDOUT_BEGIN(megdnn_fallback_winograd, 2, 0) { if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0) return false; using Strategy = fallback::winograd::winograd_2x3_4x4_f; Strategy strategy(param.src_type, param.filter_type, param.dst_type); auto&& matmul_param = megdnn::winograd::ConvBias( strategy, UNIT_TILE_SIZE, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || (opr->param().format == param::ConvBias::Format::NCHW_WINOGRAD && opr->param().output_block_size == 2 && param.winograd_matmul_format == param::MatrixMul::Format::MK4)) && opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION && (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && param.filter_meta.spatial[0] == 3) && (param.filter_meta.stride[0] == param.filter_meta.stride[1] && param.filter_meta.stride[0] == 1) && (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] && param.filter_meta.dilation[0] == 1) && param.compute_mode == param::ConvBias::ComputeMode::DEFAULT && param.src_type.enumv() == DTypeEnum::Float32; } MIDOUT_END(); return false; } /* get_workspace(): workspace size from megdnn::winograd::ConvBias instantiated with the winograd_2x3_4x4_f strategy and the MK4 matmul format. */ size_t ConvBiasImpl::AlgoWinogradF32_4x4::get_workspace( ConvBiasImpl*, const NCBKernSizeParam& p) const { MEGDNN_MARK_USED_VAR(p); MIDOUT_BEGIN(megdnn_fallback_winograd, 2, 1) { fallback::winograd::winograd_2x3_4x4_f strategy( p.src_type, p.filter_type, p.dst_type); return megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_4x4_f, param::MatrixMul::Format::MK4>(strategy, UNIT_TILE_SIZE, p) .get_workspace_size(p, m_matmul_algo); } MIDOUT_END(); return 0; } /* dispatch_kerns(): kernels from the same MK4 winograd ConvBias helper via get_kerns(param, m_matmul_algo). */ SmallVector 
ConvBiasImpl::AlgoWinogradF32_4x4::dispatch_kerns( ConvBiasImpl*, const NCBKernSizeParam& param) const { MEGDNN_MARK_USED_VAR(param); MIDOUT_BEGIN(megdnn_fallback_winograd, 2, 2) { fallback::winograd::winograd_2x3_4x4_f strategy( param.src_type, param.filter_type, param.dst_type); auto winograd_impl = megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_4x4_f, param::MatrixMul::Format::MK4>(strategy, UNIT_TILE_SIZE, param); return winograd_impl.get_kerns(param, m_matmul_algo); } MIDOUT_END(); return {}; } /* ======================= AlgoWinogradQS8 ======================== */ /* usable(): requires that m_matmul_algo accepts the winograd matmul param built with the winograd_2x3_1x1_qs8 strategy, format NCHW (or NCHW_WINOGRAD with output_block_size 2 and DEFAULT winograd matmul format), CROSS_CORRELATION mode, a 3x3 filter, stride 1, dilation 1, DEFAULT compute mode and a QuantizedS8 src dtype. */ bool ConvBiasImpl::AlgoWinogradQS8::usable( ConvBiasImpl* opr, const NCBKernSizeParam& param, AlgoSelectionStrategy /*algo_selection_strategy*/) const { MEGDNN_MARK_USED_VAR(param); MEGDNN_MARK_USED_VAR(opr); MIDOUT_BEGIN(megdnn_fallback_winograd, 3, 0) { using Strategy = fallback::winograd::winograd_2x3_1x1_qs8; Strategy strategy(param.src_type, param.filter_type, param.dst_type); auto&& matmul_param = megdnn::winograd::ConvBias( strategy, UNIT_TILE_SIZE, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || (opr->param().format == param::ConvBias::Format::NCHW_WINOGRAD && opr->param().output_block_size == 2 && param.winograd_matmul_format == param::MatrixMul::Format::DEFAULT)) && opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION && (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && param.filter_meta.spatial[0] == 3) && (param.filter_meta.stride[0] == param.filter_meta.stride[1] && param.filter_meta.stride[0] == 1) && (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] && param.filter_meta.dilation[0] == 1) && param.compute_mode == param::ConvBias::ComputeMode::DEFAULT && param.src_type.enumv() == DTypeEnum::QuantizedS8; } MIDOUT_END(); return false; } /* get_workspace(): workspace size from megdnn::winograd::ConvBias for the winograd_2x3_1x1_qs8 strategy with UNIT_TILE_SIZE and m_matmul_algo. */ size_t ConvBiasImpl::AlgoWinogradQS8::get_workspace( ConvBiasImpl*, const NCBKernSizeParam& p) const { 
MEGDNN_MARK_USED_VAR(p); MIDOUT_BEGIN(megdnn_fallback_winograd, 3, 1) { fallback::winograd::winograd_2x3_1x1_qs8 strategy( p.src_type, p.filter_type, p.dst_type); return megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_1x1_qs8>( strategy, UNIT_TILE_SIZE, p) .get_workspace_size(p, m_matmul_algo); } MIDOUT_END(); return 0; } /* dispatch_kerns(): kernels from the winograd_2x3_1x1_qs8 ConvBias helper via get_kerns(param, m_matmul_algo). */ SmallVector ConvBiasImpl::AlgoWinogradQS8::dispatch_kerns( ConvBiasImpl*, const NCBKernSizeParam& param) const { MEGDNN_MARK_USED_VAR(param); MIDOUT_BEGIN(megdnn_fallback_winograd, 3, 2) { fallback::winograd::winograd_2x3_1x1_qs8 strategy( param.src_type, param.filter_type, param.dst_type); auto winograd_impl = megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_1x1_qs8>( strategy, UNIT_TILE_SIZE, param); return winograd_impl.get_kerns(param, m_matmul_algo); } MIDOUT_END(); return {}; } /* ======================= AlgoWinogradQS8 8x8 ======================== */ /* usable(): returns false early unless both icpg and ocpg are multiples of 8; otherwise applies the checks written out below with the winograd_2x3_8x8_qs8 strategy, where the NCHW_WINOGRAD branch requires the MK8 winograd matmul format and the src dtype must be QuantizedS8. */ bool ConvBiasImpl::AlgoWinogradQS8_8x8::usable( ConvBiasImpl* opr, const NCBKernSizeParam& param, AlgoSelectionStrategy /*algo_selection_strategy*/) const { MEGDNN_MARK_USED_VAR(param); MEGDNN_MARK_USED_VAR(opr); MIDOUT_BEGIN(megdnn_fallback_winograd, 4, 0) { if (param.filter_meta.icpg % 8 != 0 || param.filter_meta.ocpg % 8 != 0) return false; using Strategy = fallback::winograd::winograd_2x3_8x8_qs8; Strategy strategy(param.src_type, param.filter_type, param.dst_type); auto&& matmul_param = megdnn::winograd::ConvBias( strategy, UNIT_TILE_SIZE, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || (opr->param().format == param::ConvBias::Format::NCHW_WINOGRAD && opr->param().output_block_size == 2 && param.winograd_matmul_format == param::MatrixMul::Format::MK8)) && opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION && (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && param.filter_meta.spatial[0] == 3) && (param.filter_meta.stride[0] == 
param.filter_meta.stride[1] && param.filter_meta.stride[0] == 1) && (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] && param.filter_meta.dilation[0] == 1) && param.compute_mode == param::ConvBias::ComputeMode::DEFAULT && param.src_type.enumv() == DTypeEnum::QuantizedS8; } MIDOUT_END(); return false; } /* get_workspace(): workspace size from megdnn::winograd::ConvBias instantiated with the winograd_2x3_8x8_qs8 strategy and the MK8 matmul format. */ size_t ConvBiasImpl::AlgoWinogradQS8_8x8::get_workspace( ConvBiasImpl*, const NCBKernSizeParam& p) const { MEGDNN_MARK_USED_VAR(p); MIDOUT_BEGIN(megdnn_fallback_winograd, 4, 1) { fallback::winograd::winograd_2x3_8x8_qs8 strategy( p.src_type, p.filter_type, p.dst_type); return megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_8x8_qs8, param::MatrixMul::Format::MK8>(strategy, UNIT_TILE_SIZE, p) .get_workspace_size(p, m_matmul_algo); } MIDOUT_END(); return 0; } /* dispatch_kerns(): kernels from the same MK8 winograd ConvBias helper via get_kerns(param, m_matmul_algo). */ SmallVector ConvBiasImpl::AlgoWinogradQS8_8x8::dispatch_kerns( ConvBiasImpl*, const NCBKernSizeParam& param) const { MEGDNN_MARK_USED_VAR(param); MIDOUT_BEGIN(megdnn_fallback_winograd, 4, 2) { fallback::winograd::winograd_2x3_8x8_qs8 strategy( param.src_type, param.filter_type, param.dst_type); auto winograd_impl = megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_8x8_qs8, param::MatrixMul::Format::MK8>(strategy, UNIT_TILE_SIZE, param); return winograd_impl.get_kerns(param, m_matmul_algo); } MIDOUT_END(); return {}; } // vim: syntax=cpp.doxygen