Commit edd7e167 authored by Megvii Engine Team, committed by Xu Xinran

feat(dnn/fallback): add im2col filterpreprocess function

GitOrigin-RevId: 61c54ad258a42301711d3efdae0caef47d7b0584
Parent 9e9e8ca0
......@@ -31,35 +31,10 @@ using namespace im2col;
* *Through which the needed ptr can be conveniently obtained
*/
struct Im2colBundelIndex {
static constexpr size_t BUNDLE_PADDING_INDEX = 0_z;
static constexpr size_t BUNDLE_PACKA_INDEX = 1_z;
static constexpr size_t BUNDLE_THREAD_INDEX = 2_z;
};
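/*!
 * Editor's sketch (not part of this commit): how the indices above address
 * the top-level workspace built by get_bundle() later in this file, which
 * returns {nullptr, {padding, packa_size, thread_ws * nr_threads}}:
 *
 *   void* padding_ws = bundle.get(Im2colBundelIndex::BUNDLE_PADDING_INDEX);
 *   void* packa_ws = bundle.get(Im2colBundelIndex::BUNDLE_PACKA_INDEX);
 *   void* thread_ws = bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX);
 */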
using Pack_Mode = fallback::MatrixMulImpl::AlgoBase::PackMode;
//! Copy padding for one input channel
static void copy_padding_kern(WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index,
StrategyBase* im2colstrategy, size_t pack_oc_size) {
im2colstrategy->copy_padding_kern(bundle, param, ncb_index, pack_oc_size);
}
//! packA_kern
static void packA_kern(
WorkspaceBundle& bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
StrategyBase* im2colstrategy,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc,
size_t pack_oc_size) {
im2colstrategy->packA_kern(bundle, param, matmulparam, matmul_algo,
ncb_index, matmul_desc, pack_oc_size);
}
/*!
* *\brief Im2colKerns collects all the im2col kerns
*/
......@@ -124,8 +99,8 @@ public:
WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
fallback::MatrixMulImpl::KernSizeParam im2col_kern_param,
MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];
......@@ -205,8 +180,8 @@ public:
}
WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
fallback::MatrixMulImpl::KernSizeParam im2col_kern_param,
MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];
......@@ -288,8 +263,8 @@ public:
}
WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
fallback::MatrixMulImpl::KernSizeParam im2col_kern_param,
MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];
......@@ -322,15 +297,16 @@ public:
}
};
fallback::MatrixMulImpl::KernSizeParam
ConvBiasImpl::AlgoIm2col::get_matmul_kern_param(const NCBKernSizeParam& param,
size_t ohw_tile_size,
size_t oc_tile_size) const {
namespace {
static fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
size_t ohw_tile_size, size_t oc_tile_size) {
auto format = param::MatrixMul::Format::DEFAULT;
size_t pack_oc_size = pack_size(param.filter_meta.format);
if (param.filter_meta.format == param::ConvBias::Format::NCHW44) {
format = param::MatrixMul::Format::MK4;
} else if(param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT){
} else if (param.filter_meta.format ==
param::ConvBias::Format::NCHW44_DOT) {
format = param::MatrixMul::Format::MK4_DOT;
}
size_t M = oc_tile_size;
......@@ -358,10 +334,23 @@ ConvBiasImpl::AlgoIm2col ::get_matmul_kern_param(const NCBKernSizeParam& param,
format};
}
void ConvBiasImpl::AlgoIm2col::choice_ohw_oc_block(
const NCBKernSizeParam& param, size_t& oc_tile_size,
size_t& ohw_tile_size, size_t block_m, size_t block_n,
fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) const {
static void choice_ohw_oc_block(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
size_t& oc_tile_size, size_t& ohw_tile_size, size_t block_m,
size_t block_n, const size_t m_ohw_tile_size,
fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) {
//! used by choice_ohw_oc_block() when calculating the tile sizes:
//! when ohw_tile_size < this value, ohw_tile_size = ohw
static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32;
//! when nr_threads > 1 and round(ohw, nr_threads) > nr_threads,
//! oc_tile_size = DEFAULT_OC_TILE_SIZE
static constexpr size_t DEFAULT_OC_TILE_SIZE = 512;
//! when oc_tile_size > this value, oc_tile_size =
//! DEFAULT_OC_MAX_TILE_SIZE
static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024;
//! when oc_tile_size < this value, oc_tile_size =
//! DEFAULT_OC_MIN_TILE_SIZE; the purpose is aligning the calculation
static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128;
size_t nr_threads = param.nr_threads;
size_t OC = param.filter_meta.ocpg;
size_t ohw = param.osz[0] * param.osz[1];
......@@ -393,8 +382,74 @@ void ConvBiasImpl::AlgoIm2col::choice_ohw_oc_block(
}
}
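//! Editor's sketch (not in this commit): the body of choice_ohw_oc_block()
//! is elided from this hunk, so the helper below only illustrates the
//! clamping described by the DEFAULT_* comments above; the constants are
//! repeated locally because the originals are function-local.
static inline void clamp_tiles_sketch(size_t ohw, size_t& oc_tile_size,
                                      size_t& ohw_tile_size) {
    constexpr size_t OHW_MIN_TILE = 32, OC_MAX_TILE = 1024, OC_MIN_TILE = 128;
    if (ohw_tile_size < OHW_MIN_TILE) {
        //! the tile is too small to be worth splitting, take the whole ohw
        ohw_tile_size = ohw;
    }
    if (oc_tile_size > OC_MAX_TILE)  //! cap at the maximum tile size
        oc_tile_size = OC_MAX_TILE;
    if (oc_tile_size < OC_MIN_TILE)  //! raise to the minimum for alignment
        oc_tile_size = OC_MIN_TILE;
}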
WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle(
const NCBKernSizeParam& param) const {
static size_t packA_group_size(
const MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::MatrixMulImpl::KernSizeParam& matmul_param,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc,
size_t packa_parallel_times) {
if (matmul_desc.packmode ==
fallback::MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
return matmul_algo->get_bundle(matmul_param).get_size(0);
} else if (matmul_desc.packmode ==
fallback::MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
return packa_parallel_times *
matmul_algo->get_bundle(matmul_param).get_size(0);
}
megdnn_assert(matmul_desc.packmode ==
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK);
//! nopack mode needs no packA storage, return 0
return 0;
}
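//! Editor's note, a worked example with assumed numbers: if
//! matmul_algo->get_bundle(matmul_param).get_size(0) is 4 KB and OC is
//! split into packa_parallel_times = 8 tiles, then DEFAULT reserves one
//! whole-OC panel of 4 KB per group, ONLY_PACKA reserves one panel per
//! tile for 8 * 4 KB = 32 KB per group, and NO_PACK reserves nothing.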
static WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
const MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::MatrixMulImpl::KernSizeParam& matmul_param,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc,
size_t oc_tile_size, size_t ohw_tile_size) {
if (matmul_desc.packmode == Pack_Mode::DEFAULT) {
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_dft"_hash)) {
Im2colKerns<Pack_Mode::DEFAULT> defaultkern;
return defaultkern.get_thread_bundle(param, matmul_param,
matmul_algo, ohw_tile_size,
oc_tile_size);
}
MIDOUT_END();
} else if (matmul_desc.packmode ==
fallback::MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv(
"ConvBiasImpl::AlgoIm2col::get_bundle_onlypacka"_hash)) {
Im2colKerns<Pack_Mode::ONLY_PACKA> onlypackakern;
return onlypackakern.get_thread_bundle(param, matmul_param,
matmul_algo, ohw_tile_size,
oc_tile_size);
}
MIDOUT_END();
} else {
megdnn_assert(matmul_desc.packmode ==
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK);
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv(
"ConvBiasImpl::AlgoIm2col::get_thread_bundle_nopack"_hash)) {
Im2colKerns<Pack_Mode::NO_PACK> nopackkern;
return nopackkern.get_thread_bundle(param, matmul_param,
matmul_algo, ohw_tile_size,
oc_tile_size);
}
MIDOUT_END();
}
return {nullptr, {}};
}
static WorkspaceBundle get_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_tile_size,
size_t ohw_tile_size) {
UNPACK_CONV_F32_NCB_KERN_SIZES(param);
MEGDNN_MARK_USED_VAR(OC);
MEGDNN_MARK_USED_VAR(OH);
......@@ -410,23 +465,20 @@ WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle(
size_t padding = 0, packa_size = 0, packa_group_size = 0;
size_t nr_threads = param.nr_threads;
size_t GROUP = param.filter_meta.group;
fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc =
m_matmul_algo->matmul_description();
bool need_pack = mdesc.packmode == Pack_Mode::DEFAULT;
bool only_packA = mdesc.packmode == Pack_Mode::ONLY_PACKA;
size_t oc_tile_size = 0, ohw_tile_size = 0;
choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size,
mdesc.innerblocksize.m, mdesc.innerblocksize.n,
mdesc.packmode);
if (need_pack || only_packA) {
auto im2col_kern_param = get_matmul_kern_param(
param, ohw_tile_size, only_packA ? oc_tile_size : OC);
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
WorkspaceBundle wb = m_matmul_algo->get_bundle(im2col_kern_param);
packa_group_size = only_packA ? oc_parallel_times * wb.get_size(0)
: wb.get_size(0);
} else { //! pack not supported, no need to pack
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc =
matmul_algo->matmul_description();
bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT;
//! when packmode is default, use OC;
//! when packmode is onlypackA, use oc_tile_size
auto im2col_kern_param = get_matmul_kern_param(
param, ohw_tile_size, default_pack ? OC : oc_tile_size);
if (is_enable_filter_preprocess(param)) {
packa_group_size = 0;
} else {
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
packa_group_size = packA_group_size(matmul_algo, im2col_kern_param,
matmul_desc, oc_parallel_times);
}
if (no_need_pading) {
......@@ -437,50 +489,27 @@ WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle(
}
packa_size = GROUP * packa_group_size; //! for packA size = GROUP * a_size
WorkspaceBundle ws = {nullptr, {}};
auto im2col_kern_param =
get_matmul_kern_param(param, ohw_tile_size, oc_tile_size);
if (m_matmul_algo->packmode() == Pack_Mode::DEFAULT) {
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_dft"_hash)) {
Im2colKerns<Pack_Mode::DEFAULT> defaultkern;
ws = defaultkern.get_thread_bundle(param, im2col_kern_param,
m_matmul_algo, ohw_tile_size,
oc_tile_size);
}
MIDOUT_END();
} else if (m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA) {
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_packa"_hash)) {
Im2colKerns<Pack_Mode::ONLY_PACKA> onlypackakern;
ws = onlypackakern.get_thread_bundle(param, im2col_kern_param,
m_matmul_algo, ohw_tile_size,
oc_tile_size);
}
MIDOUT_END();
} else {
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_other"_hash)) {
Im2colKerns<Pack_Mode::NO_PACK> nopackkern;
ws = nopackkern.get_thread_bundle(param, im2col_kern_param,
m_matmul_algo, ohw_tile_size,
oc_tile_size);
}
MIDOUT_END();
}
WorkspaceBundle ws =
get_thread_bundle(param, matmul_algo, im2col_kern_param,
matmul_desc, oc_tile_size, ohw_tile_size);
return {nullptr,
{padding, packa_size, ws.total_size_in_bytes() * nr_threads}};
}
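//! Editor's note (illustration, not in this commit): when filter
//! preprocess is enabled the packed-A panels live in the preprocessed
//! filter tensor, so the bundle returned above degenerates to
//!   {padding, 0, thread_ws * nr_threads},
//! while without preprocess it is
//!   {padding, GROUP * packa_group_size, thread_ws * nr_threads}.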
} // namespace
size_t ConvBiasImpl::AlgoIm2col::get_workspace(
const NCBKernSizeParam& p) const {
MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 0) {
return get_bundle(p).total_size_in_bytes();
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc =
m_matmul_algo->matmul_description();
size_t oc_tile_size = 0, ohw_tile_size = 0;
choice_ohw_oc_block(p, oc_tile_size, ohw_tile_size,
matmul_desc.innerblocksize.m, matmul_desc.innerblocksize.n,
m_ohw_tile_size, matmul_desc.packmode);
return get_bundle(p, m_matmul_algo, oc_tile_size, ohw_tile_size)
.total_size_in_bytes();
}
MIDOUT_END();
return 0;
......@@ -499,22 +528,21 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
size_t oc_tile_size = 0, ohw_tile_size = 0;
size_t ohw = OH * OW;
size_t GROUP = param.filter_meta.group;
WorkspaceBundle bundle = get_bundle(param);
WorkspaceBundle bundle_thread = {nullptr, {}};
bool need_padding = (PH != 0 || PW != 0);
fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc =
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc =
m_matmul_algo->matmul_description();
Pack_Mode packmode = mdesc.packmode;
bool default_pack = packmode == Pack_Mode::DEFAULT;
bool no_pack = packmode == Pack_Mode::NO_PACK;
bool only_packA = packmode == Pack_Mode::ONLY_PACKA;
bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT;
bool no_pack = matmul_desc.packmode == Pack_Mode::NO_PACK;
bool only_packA = matmul_desc.packmode == Pack_Mode::ONLY_PACKA;
bool enable_filter_preprocess = is_enable_filter_preprocess(param);
choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size,
mdesc.innerblocksize.m, mdesc.innerblocksize.n,
mdesc.packmode);
matmul_desc.innerblocksize.m,
matmul_desc.innerblocksize.n, m_ohw_tile_size,
matmul_desc.packmode);
WorkspaceBundle bundle = get_bundle(param, m_matmul_algo, oc_tile_size,
ohw_tile_size);
size_t ohw_parallel_times = div_ceil(ohw, ohw_tile_size);
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
size_t packa_parallel_times = 0;
......@@ -523,28 +551,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
if (only_packA) {
packa_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
} else if (default_pack) {
packa_parallel_times = div_ceil<size_t>(OC, mdesc.innerblocksize.m);
packa_parallel_times =
div_ceil<size_t>(OC, matmul_desc.innerblocksize.m);
}
auto matmul_param = get_matmul_kern_param(
param, ohw_tile_size, only_packA ? oc_tile_size : OC);
if (mdesc.packmode == Pack_Mode::DEFAULT) {
Im2colKerns<Pack_Mode::DEFAULT> defaultkern;
bundle_thread = defaultkern.get_thread_bundle(
param, matmul_param, m_matmul_algo, ohw_tile_size,
oc_tile_size);
} else if (mdesc.packmode == Pack_Mode::ONLY_PACKA) {
Im2colKerns<Pack_Mode::ONLY_PACKA> onlypackakern;
bundle_thread = onlypackakern.get_thread_bundle(
param, matmul_param, m_matmul_algo, ohw_tile_size,
oc_tile_size);
} else {
Im2colKerns<Pack_Mode::NO_PACK> nopackkern;
bundle_thread = nopackkern.get_thread_bundle(
param, matmul_param, m_matmul_algo, ohw_tile_size,
oc_tile_size);
}
param, ohw_tile_size, default_pack ? OC : oc_tile_size);
WorkspaceBundle bundle_thread =
get_thread_bundle(param, m_matmul_algo, matmul_param,
matmul_desc, oc_tile_size, ohw_tile_size);
StrategyParam strategyparam;
strategyparam.ohw = ohw;
strategyparam.is_dst_8bit =
......@@ -557,6 +573,9 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
strategyparam.is_ohw_size_bigger && !strategyparam.is_dst_8bit;
strategyparam.oc_tile_size = oc_tile_size;
strategyparam.pack_oc_size = pack_oc_size;
strategyparam.enable_filter_preprocess = enable_filter_preprocess;
strategyparam.packA_group_size = packA_group_size(
m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times);
SmallVector<ConvBiasImpl::NCBKern> ret_kern;
MIDOUT_BEGIN(
......@@ -569,88 +588,126 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
copy_padding_kern(bundle, param, ncb_index, im2colstrategy,
pack_oc_size);
im2colstrategy->copy_padding_kern(bundle, param, ncb_index,
pack_oc_size);
};
auto kern_packA = [bundle, matmul_algo = m_matmul_algo,
matmul_param, im2colstrategy,
pack_oc_size = pack_oc_size, mdesc = mdesc](
strategyparam = strategyparam,
matmul_desc = matmul_desc](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
packA_kern(bundle, param, matmul_param, matmul_algo, ncb_index,
im2colstrategy, mdesc, pack_oc_size);
im2colstrategy->packA_kern(bundle, param, matmul_param,
matmul_algo, ncb_index, matmul_desc,
strategyparam);
};
if (default_pack) {
auto kern_compute_default =
[bundle, bundle_thread, matmul_param,
matmul_algo = m_matmul_algo,
ohw_tile_size = ohw_tile_size,
strategyparam = strategyparam, matmul_desc = mdesc,
im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::DEFAULT>::kerns(
bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam,
ncb_index, ohw_tile_size, im2colstrategy);
};
ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}});
if (need_padding) {
ret_kern.push_back({kern_padding,
{param.n, GROUP, IC / pack_oc_size}});
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv(
"ConvBiasImpl::AlgoIm2col::dispatch_kerns_default_pack"_hash)) {
auto kern_compute_default =
[bundle, bundle_thread, matmul_param,
matmul_algo = m_matmul_algo,
ohw_tile_size = ohw_tile_size,
strategyparam = strategyparam,
matmul_desc = matmul_desc, im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::DEFAULT>::kerns(
bundle, bundle_thread, param,
matmul_param, matmul_algo, matmul_desc,
strategyparam, ncb_index, ohw_tile_size,
im2colstrategy);
};
if (!enable_filter_preprocess) {
ret_kern.push_back(
{kern_packA, {GROUP, packa_parallel_times}});
}
if (need_padding) {
ret_kern.push_back(
{kern_padding,
{param.n, GROUP, IC / pack_oc_size}});
}
ret_kern.push_back({kern_compute_default,
{N, GROUP, ohw_parallel_times,
oc_parallel_times}});
return ret_kern;
}
ret_kern.push_back(
{kern_compute_default,
{N, GROUP, ohw_parallel_times, oc_parallel_times}});
MIDOUT_END();
return {};
} else if (only_packA) {
auto kern_compute_onlypackA =
[bundle, bundle_thread, matmul_param,
matmul_algo = m_matmul_algo,
strategyparam = strategyparam,
ohw_tile_size = ohw_tile_size, matmul_desc = mdesc,
im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::ONLY_PACKA>::kerns(
bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam,
ncb_index, ohw_tile_size, im2colstrategy);
};
ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}});
if (need_padding) {
ret_kern.push_back({kern_padding, {param.n, GROUP, IC}});
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv(
"ConvBiasImpl::AlgoIm2col::dispatch_kerns_onlypacka"_hash)) {
auto kern_compute_onlypackA =
[bundle, bundle_thread, matmul_param,
matmul_algo = m_matmul_algo,
strategyparam = strategyparam,
ohw_tile_size = ohw_tile_size,
matmul_desc = matmul_desc, im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::ONLY_PACKA>::kerns(
bundle, bundle_thread, param,
matmul_param, matmul_algo, matmul_desc,
strategyparam, ncb_index, ohw_tile_size,
im2colstrategy);
};
if (!enable_filter_preprocess) {
ret_kern.push_back(
{kern_packA, {GROUP, packa_parallel_times}});
}
if (need_padding) {
ret_kern.push_back(
{kern_padding, {param.n, GROUP, IC}});
}
ret_kern.push_back({kern_compute_onlypackA,
{N, GROUP, ohw_parallel_times,
oc_parallel_times}});
return ret_kern;
}
ret_kern.push_back(
{kern_compute_onlypackA,
{N, GROUP, ohw_parallel_times, oc_parallel_times}});
MIDOUT_END();
return {};
} else if (no_pack) {
auto kern_compute_nopack =
[bundle, bundle_thread, matmul_param,
matmul_algo = m_matmul_algo,
strategyparam = strategyparam,
ohw_tile_size = ohw_tile_size, matmul_desc = mdesc,
im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::NO_PACK>::kerns(
bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam,
ncb_index, ohw_tile_size, im2colstrategy);
};
if (need_padding) {
ret_kern.push_back({kern_padding, {param.n, GROUP, IC}});
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv(
"ConvBiasImpl::AlgoIm2col::dispatch_kerns_no_pack"_hash)) {
auto kern_compute_nopack =
[bundle, bundle_thread, matmul_param,
matmul_algo = m_matmul_algo,
strategyparam = strategyparam,
ohw_tile_size = ohw_tile_size,
matmul_desc = matmul_desc, im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::NO_PACK>::kerns(
bundle, bundle_thread, param,
matmul_param, matmul_algo, matmul_desc,
strategyparam, ncb_index, ohw_tile_size,
im2colstrategy);
};
if (need_padding) {
ret_kern.push_back(
{kern_padding, {param.n, GROUP, IC}});
}
ret_kern.push_back({kern_compute_nopack,
{N, GROUP, ohw_parallel_times,
oc_parallel_times}});
return ret_kern;
}
ret_kern.push_back(
{kern_compute_nopack,
{N, GROUP, ohw_parallel_times, oc_parallel_times}});
MIDOUT_END();
return {};
}
return ret_kern;
return {};
}
MIDOUT_END();
return {};
......@@ -694,12 +751,19 @@ bool ConvBiasImpl::AlgoIm2col::usable(
return false;
}
}
fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc =
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc =
m_matmul_algo->matmul_description();
//! weight preprocess is supported only when matmul's packmode is packa or default
if (is_enable_filter_preprocess(param) &&
(matmul_desc.packmode ==
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) {
return false;
}
if (format == param::ConvBias::Format::NCHW44 ||
format == param::ConvBias::Format::NCHW44_DOT) {
//! current NCHW44 im2col only supports DEFAULT mode matmul
if (mdesc.packmode != Pack_Mode::DEFAULT) {
if (matmul_desc.packmode != Pack_Mode::DEFAULT) {
return false;
//! nchw44 hybrid mode and channel wise are not supported
} else if (param.filter_meta.icpg < 4_z ||
......@@ -711,8 +775,9 @@ bool ConvBiasImpl::AlgoIm2col::usable(
size_t oc_tile_size = 0, ohw_tile_size = 0;
choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size,
mdesc.innerblocksize.m, mdesc.innerblocksize.n,
m_matmul_algo->packmode());
matmul_desc.innerblocksize.m,
matmul_desc.innerblocksize.n, m_ohw_tile_size,
matmul_desc.packmode);
fallback::MatrixMulImpl::KernSizeParam matmul_param =
get_matmul_kern_param(param, ohw_tile_size, oc_tile_size);
bool matmulusable = m_matmul_algo->usable(matmul_param);
......@@ -731,4 +796,104 @@ bool ConvBiasImpl::AlgoIm2col::usable(
return false;
}
SmallVector<TensorLayout>
ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout(
const NCBKernSizeParam& param) const {
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv(
"ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout"_hash)) {
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc =
m_matmul_algo->matmul_description();
//! only default_pack and only_packa modes are supported
if (matmul_desc.packmode == Pack_Mode::NO_PACK) {
return {};
}
size_t GROUP = param.filter_meta.group;
bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT;
size_t OC = param.filter_meta.ocpg;
SmallVector<TensorLayout> preprocessed_layouts;
size_t oc_tile_size = 0, ohw_tile_size = 0;
choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size,
matmul_desc.innerblocksize.m,
matmul_desc.innerblocksize.n, m_ohw_tile_size,
matmul_desc.packmode);
auto matmul_param = get_matmul_kern_param(
param, ohw_tile_size, default_pack ? OC : oc_tile_size);
size_t packa_parallel_times = div_ceil<size_t>(
OC, default_pack ? matmul_desc.innerblocksize.m : oc_tile_size);
size_t packa_group_size = packA_group_size(
m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times);
preprocessed_layouts.push_back(
{{GROUP, packa_group_size}, dtype::Int8()});
return preprocessed_layouts;
}
MIDOUT_END();
return {};
}
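//! Editor's note: the deduced preprocessed filter is a single raw byte
//! buffer of shape {GROUP, packa_group_size} with dtype Int8. A worked
//! example with assumed sizes: GROUP = 2 and packa_group_size = 4096
//! bytes yields TensorLayout{{2, 4096}, dtype::Int8()}, one contiguous
//! packed-A chunk per group.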
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoIm2col::dispatch_preprocess_kerns(
const NCBKernSizeParam& param) const {
MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 3) {
size_t OC = param.filter_meta.ocpg;
size_t oc_tile_size = 0, ohw_tile_size = 0;
size_t GROUP = param.filter_meta.group;
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc =
m_matmul_algo->matmul_description();
choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size,
matmul_desc.innerblocksize.m,
matmul_desc.innerblocksize.n, m_ohw_tile_size,
matmul_desc.packmode);
WorkspaceBundle bundle =
get_bundle(param, m_matmul_algo, oc_tile_size, ohw_tile_size);
Pack_Mode packmode = matmul_desc.packmode;
bool default_pack = packmode == Pack_Mode::DEFAULT;
bool only_packA = packmode == Pack_Mode::ONLY_PACKA;
size_t packa_parallel_times = 0;
if (only_packA) {
packa_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
} else if (default_pack) {
packa_parallel_times =
div_ceil<size_t>(OC, matmul_desc.innerblocksize.m);
} else {
//! if nopack, return empty so that OprWeightPreprocessProxy can run
//! in nopack mode
return {};
}
auto matmul_param = get_matmul_kern_param(
param, ohw_tile_size, default_pack ? OC : oc_tile_size);
StrategyParam strategyparam;
strategyparam.enable_filter_preprocess =
is_enable_filter_preprocess(param);
strategyparam.packA_group_size = packA_group_size(
m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times);
SmallVector<ConvBiasImpl::NCBKern> ret_kern;
StrategyBase* im2colstrategy =
Factory::get_im2col_strategy(param, m_matmul_algo);
auto kern_packA = [bundle, matmul_algo = m_matmul_algo, matmul_param,
im2colstrategy, strategyparam = strategyparam,
matmul_desc = matmul_desc](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->packA_kern(bundle, param, matmul_param, matmul_algo,
ncb_index, matmul_desc, strategyparam);
};
ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}});
return ret_kern;
}
MIDOUT_END();
return {};
}
// vim: syntax=cpp.doxygen
......@@ -22,27 +22,6 @@ namespace megdnn {
namespace fallback {
class ConvBiasImpl::AlgoIm2col final : public AlgoBase {
//! calculate m_oc_tile_size in the choice_ohw_oc_block() function,
//! when ohw_tile_size < this value, ohw_tile_size = ohw
static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32;
//! when nr_threads > 1 and round(ohw, nr_threads) > nr_threads,
//! m_oc_tile_size = DEFAULT_OC_TILE_SIZE
static constexpr size_t DEFAULT_OC_TILE_SIZE = 512;
//! when m_oc_tile_size > this value, m_oc_tile_size =
//! DEFAULT_OC_MAX_TILE_SIZE
static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024;
//! when m_oc_tile_size < this value, m_oc_tile_size =
//! DEFAULT_OC_MIN_TILE_SIZE; the purpose is aligning the calculation
static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128;
fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param(
const NCBKernSizeParam& param, size_t ohw_tile_size,
size_t oc_tile_size) const;
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const;
void choice_ohw_oc_block(
const NCBKernSizeParam& param, size_t& oc_tile_size,
size_t& ohw_tile_size, size_t block_m, size_t block_n,
fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) const;
public:
AlgoIm2col(MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size)
: m_matmul_algo(matmul_algo),
......@@ -59,10 +38,16 @@ public:
bool usable(const NCBKernSizeParam& param,
AlgoSelectionStrategy algo_selection_strategy) const override;
size_t get_workspace(const NCBKernSizeParam& param) const override;
SmallVector<NCBKern> dispatch_kerns(
SmallVector<NCBKern> dispatch_kerns(const NCBKernSizeParam& param) const override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const NCBKernSizeParam& param) const override;
size_t get_preprocess_workspace(
const NCBKernSizeParam& /*param*/) const override {
return 0;
}
SmallVector<NCBKern> dispatch_preprocess_kerns(
const NCBKernSizeParam& param) const override;
bool is_preferred(
const NCBKernSizeParam& param) const override {
bool is_preferred(const NCBKernSizeParam& param) const override {
if (param.src_type.category() == DTypeCategory::QUANTIZED) {
static CpuOprDelegationStorage<1> storage;
auto conv_bias_opr = storage.get<ConvBias, 0>();
......
......@@ -40,9 +40,11 @@ struct StrategyParam {
size_t block_n;
size_t block_k;
size_t pack_oc_size;
size_t packA_group_size;
bool skip_copy_dst;
bool is_dst_8bit;
bool is_ohw_size_bigger;
bool enable_filter_preprocess;
};
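//! Editor's note on the two fields added above: packA_group_size caches
//! the per-group packed-A byte size so the strategies no longer re-query
//! matmul_algo->get_bundle(matmul_param).get_size(0) inside the kerns,
//! and enable_filter_preprocess tells the strategy kerns to read the
//! packed weights from param.preprocessed_filter instead of the workspace.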
class StrategyBase {
......@@ -62,7 +64,7 @@ public:
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desec,
size_t pack_size) = 0;
const StrategyParam& sparam) = 0;
virtual void exec_im2col(
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
......@@ -296,7 +298,7 @@ public:
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc,
size_t pack_size) override;
const StrategyParam& sparam) override;
virtual void exec_im2col(
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam,
......@@ -375,7 +377,7 @@ public:
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec,
size_t pack_size) override;
const StrategyParam& sparam) override;
void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, const WorkspaceBundle& bundle,
......@@ -431,7 +433,7 @@ public:
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec,
size_t pack_size) override;
const StrategyParam& sparam) override;
void exec_im2col(
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
......
......@@ -25,19 +25,23 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc,
size_t) {
const StrategyParam& sparam) {
fallback::MatrixMulImpl::KernParam matmul_param;
size_t group_id = ncb_index.ndrange_id[0];
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
matmulparam;
size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0);
size_t packed_per_oc_block_size =
round_up(matmul_param.K, matmul_desc.innerblocksize.k) *
matmul_desc.innerblocksize.m * matmul_desc.packa_type_size;
size_t a_panel_offset = ncb_index.ndrange_id[1] * packed_per_oc_block_size;
int8_t* a_panel = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) +
group_id * packA_group_size + a_panel_offset;
int8_t* tmp_ptr =
sparam.enable_filter_preprocess
? static_cast<int8_t*>(
param.preprocessed_filter->tensors[0].raw_ptr)
: static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX));
int8_t* a_panel =
tmp_ptr + group_id * sparam.packA_group_size + a_panel_offset;
matmul_param.A_ptr =
const_cast<src_ctype*>(param.filter<src_ctype>(group_id));
matmul_algo->pack_A(matmul_param, a_panel, ncb_index.ndrange_id[1],
......@@ -149,15 +153,20 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
size_t packA_per_oc_block_size =
round_up(matmul_param.K, matmul_desc.innerblocksize.k) *
sparam.oc_tile_size * matmul_desc.packa_type_size;
size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0);
size_t packA_group_size = sparam.packA_group_size;
size_t a_panel_offset = ncb_index.ndrange_id[1] * packA_group_size +
ncb_index.ndrange_id[3] * packA_per_oc_block_size;
void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);
src_ctype* a_panel = reinterpret_cast<src_ctype*>(
reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) +
a_panel_offset);
int8_t* tmp_ptr =
sparam.enable_filter_preprocess
? static_cast<int8_t*>(
param.preprocessed_filter->tensors[0].raw_ptr)
: static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX));
src_ctype* a_panel =
reinterpret_cast<src_ctype*>(tmp_ptr + a_panel_offset);
src_ctype* b_panel =
reinterpret_cast<src_ctype*>(reinterpret_cast<uintptr_t>(
bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX)));
......
......@@ -26,7 +26,7 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::
MatmulDescription& /*matmul_dsec*/,
size_t) {
const StrategyParam&) {
MEGDNN_MARK_USED_VAR(bundle);
MEGDNN_MARK_USED_VAR(param);
MEGDNN_MARK_USED_VAR(matmulparam);
......
......@@ -26,7 +26,7 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::
MatmulDescription& /*matmul_desc*/,
size_t) {
const StrategyParam& sparam) {
fallback::MatrixMulImpl::KernParam matmul_param;
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
matmulparam;
......@@ -36,12 +36,17 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
size_t output_block_oc_size =
std::min(oc_tile_size, OC - ncb_index.ndrange_id[1] * oc_tile_size);
size_t oc_cur_index = ncb_index.ndrange_id[1] * oc_tile_size;
size_t packA_group_size =
bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group;
size_t a_panel_offset = ncb_index.ndrange_id[1] *
matmul_algo->get_bundle(matmul_param).get_size(0);
int8_t* a_panel = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) +
group_id * packA_group_size + a_panel_offset;
int8_t* tmp_ptr =
sparam.enable_filter_preprocess
? static_cast<int8_t*>(
param.preprocessed_filter->tensors[0].raw_ptr)
: static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX));
int8_t* a_panel = tmp_ptr +
group_id * sparam.packA_group_size + a_panel_offset;
matmul_param.A_ptr =
const_cast<src_ctype*>(param.filter<src_ctype>(group_id)) +
oc_cur_index * matmul_param.K;
......@@ -60,20 +65,22 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::
MatmulDescription& /*matmul_desc*/
) {
size_t packA_group_size =
bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group;
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
/*matmul_desc*/) {
size_t a_panel_offset = ncb_index.ndrange_id[3] *
matmul_algo->get_bundle(matmul_param).get_size(0);
a_panel_offset = sparam.group_id * packA_group_size + a_panel_offset;
a_panel_offset =
sparam.group_id * sparam.packA_group_size + a_panel_offset;
void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);
src_ctype* a_panel = reinterpret_cast<src_ctype*>(
reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) +
a_panel_offset);
int8_t* tmp_ptr =
sparam.enable_filter_preprocess
? static_cast<int8_t*>(
param.preprocessed_filter->tensors[0].raw_ptr)
: static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX));
src_ctype* a_panel = reinterpret_cast<src_ctype*>(tmp_ptr + a_panel_offset);
src_ctype* b_panel = nullptr;
src_ctype* im2col_dst = static_cast<src_ctype*>(
......
......@@ -154,7 +154,8 @@ void ConvBiasImpl::exec_preprocess(const TensorLayout& src_layout,
bias{nullptr, bias_layout};
auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace,
preprocessed_filter);
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size);
//! should not pass the workspace_size limit, otherwise a matching algo may not be found
ConvBiasImpl::Algorithm* algo = get_algorithm(fparam);
if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo,
fparam) <= workspace.size) {
exec_preprocess_with_ncb_kern(fparam, algo);
......
......@@ -299,6 +299,11 @@ private:
const PreprocessedFilter* preprocessed_filter);
};
inline bool is_enable_filter_preprocess(
const ConvBiasImpl::NCBKernSizeParam& param) {
return param.preprocessed_filter &&
param.preprocessed_filter->tensors.size() >= 1;
}
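//! Editor's sketch: this commit's im2col strategies cache the flag in
//! StrategyParam and pick the packed-weight base pointer with it, e.g.
//!
//!   int8_t* packed_base =
//!           sparam.enable_filter_preprocess
//!                   ? static_cast<int8_t*>(
//!                             param.preprocessed_filter->tensors[0].raw_ptr)
//!                   : static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX));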
} // namespace fallback
} // namespace megdnn
......
......@@ -109,7 +109,9 @@ void ConvolutionImpl::exec_preprocess(const TensorLayout& src_layout,
TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout};
auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter,
workspace);
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size);
//! should not pass the workspace_size limit, otherwise a matching algo may not be found
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam);
if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo,
fparam) <= workspace.size) {
exec_preprocess_with_ncb_kern(fparam, algo);
......
......@@ -1837,6 +1837,21 @@ void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
{arg.src, arg.filter, arg.bias, {}, {}});
}
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2_PREPROCESS) {
#define cb(name) \
check_conv_bias_preprocess( \
get_conv_bias_args({1, 2, 3, 4, 5, 6, 7}, 2, false, false, false), \
handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
dtype::Float32(), dtype::Float32(), name);
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_F32K8X12X1")
cb("IM2COLMATMUL:AARCH64_F32K4X16X1")
#elif MEGDNN_ARMV7
cb("IM2COLMATMUL:ARMV7_F32")
#endif
#undef cb
}
// clang-format off
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2) {
#define cb(name) \
......@@ -1851,6 +1866,22 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2) {
cb("IM2COLMATMUL:ARMV7_F32")
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE1_PREPROCESS) {
#define cb(name) \
check_conv_bias_preprocess( \
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false), \
handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
dtype::Float32(), dtype::Float32(), name);
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_F32K8X12X1")
cb("IM2COLMATMUL:AARCH64_F32K4X16X1")
#elif MEGDNN_ARMV7
cb("IM2COLMATMUL:ARMV7_F32")
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE1) {
......@@ -1899,6 +1930,37 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM) {
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_PREPROCESS) {
UniformIntRNG rng{-50, 50};
#define cb(name) \
check_conv_bias_preprocess(get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \
false, true, true), \
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \
dtype::QuantizedS8(60.25f), name); \
check_conv_bias_preprocess( \
get_conv_bias_args({1}, 2, false, false, false, true, true), \
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \
dtype::QuantizedS8(60.25f), name);
float epsilon = 0.001;
#if MEGDNN_AARCH64
#if __ARM_FEATURE_DOTPROD
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X12X4_DOTPROD");
#else
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X8X8");
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16");
#endif
#elif MEGDNN_ARMV7
epsilon = 1;
cb("IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8");
#endif
#undef cb
}
#if __ARM_FEATURE_DOTPROD
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT) {
......@@ -1924,6 +1986,29 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT) {
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT_PREPROCESS) {
UniformIntRNG rng{-50, 50};
#define cb(name) \
check_conv_bias_preprocess(get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, \
false, false, false, true), \
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \
dtype::QuantizedS8(60.25f), name); \
checker_conv_bias( \
get_nchw44_conv_bias_args({1}, 2, false, true, true, false, true), \
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \
dtype::QuantizedS8(60.25f), name);
float epsilon = 0.001;
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96");
#elif MEGDNN_ARMV7
cb("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT_S2_FUSE) {
UniformIntRNG rng{-50, 50};
......@@ -1968,6 +2053,31 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_S8x8x32_MK4_DOT) {
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_S8x8x32_MK4_DOT_PREPROCESS) {
UniformIntRNG rng{-50, 50};
#define cb(name) \
check_conv_bias_preprocess( \
get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \
true, false, true, false, false, true), \
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), {}, name); \
check_conv_bias_preprocess( \
get_nchw44_conv_bias_args({1}, 2, false, true, true, false, true, \
false, false, true), \
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), {}, name);
float epsilon = 0.001;
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96");
#elif MEGDNN_ARMV7
cb("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT) {
UniformIntRNG rng{-50, 50};
......@@ -1992,6 +2102,30 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT) {
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT_PREPROCESS) {
UniformIntRNG rng{-50, 50};
#define cb(name) \
check_conv_bias_preprocess( \
get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \
true, false, true, false, false, true), \
handle(), &rng, epsilon, dtype::Int8(), dtype::Int8(), \
dtype::Int32(), {}, name); \
check_conv_bias_preprocess( \
get_nchw44_conv_bias_args({1}, 2, false, true, true, false, true, \
false, false, true), \
handle(), &rng, epsilon, dtype::Int8(), dtype::Int8(), \
dtype::Int32(), {}, name);
float epsilon = 0.001;
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96");
#elif MEGDNN_ARMV7
cb("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_CONV1x1_QUANTIZEDSYM_MK4_DOT) {
UniformIntRNG rng{-50, 50};
......@@ -2055,6 +2189,41 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDASYM) {
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_IM2COLMATMUL_QUANTIZEDASYM_FILTERPREPROCESS) {
NormalRNG rng(128.f);
#define cb(name) \
check_conv_bias_preprocess( \
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false, \
true, true), \
handle(), &rng, epsilon, \
dtype::Quantized8Asymm(1.2f, (uint8_t)125), \
dtype::Quantized8Asymm(1.3f, (uint8_t)129), \
dtype::QuantizedS32(1.2 * 1.3), \
dtype::Quantized8Asymm(50.3f, (uint8_t)120), name); \
check_conv_bias_preprocess( \
get_conv_bias_args({1}, 2, false, false, false, true, true), \
handle(), &rng, epsilon, \
dtype::Quantized8Asymm(1.2f, (uint8_t)125), \
dtype::Quantized8Asymm(1.3f, (uint8_t)129), \
dtype::QuantizedS32(1.2 * 1.3), \
dtype::Quantized8Asymm(50.3f, (uint8_t)120), name);
float epsilon = 0.001;
#if MEGDNN_AARCH64
#if __ARM_FEATURE_DOTPROD
cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X4_DOTPROD");
#else
cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X8");
#endif
#elif MEGDNN_ARMV7
epsilon = 1;
cb("IM2COLMATMUL:ARMV7_QUINT8_K4X8X8");
#endif
#undef cb
}
#endif
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
......@@ -2088,6 +2257,39 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUINT8x8x32) {
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUINT8x8x32_FILTERPREPROCESS) {
UniformIntRNG rng{-50, 50};
float epsilon = 0.001;
#define cb(name) \
check_conv_bias_preprocess( \
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \
handle(), &rng, epsilon, \
dtype::Quantized8Asymm(1.2f, (uint8_t)125), \
dtype::Quantized8Asymm(1.3f, (uint8_t)129), \
dtype::QuantizedS32(1.2 * 1.3), {}, name); \
check_conv_bias_preprocess(get_conv_bias_args({1}, 2, false, true, true), \
handle(), &rng, epsilon, \
dtype::Quantized8Asymm(1.2f, (uint8_t)125), \
dtype::Quantized8Asymm(1.3f, (uint8_t)129), \
dtype::QuantizedS32(1.2 * 1.3), {}, name);
#if MEGDNN_AARCH64
#if __ARM_FEATURE_DOTPROD
cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X4_DOTPROD");
#else
cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X8");
#endif
#elif MEGDNN_ARMV7
#if __ARM_FEATURE_DOTPROD
cb("IM2COLMATMUL:AARCH32_QUINT8_K4X8X4");
#endif
cb("IM2COLMATMUL:ARMV7_QUINT8_K4X8X8");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16) {
UniformIntRNG rng{-50, 50};
float epsilon = 0.001;
......@@ -2127,6 +2329,51 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16) {
#undef cb
#undef cb_nchw44
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16_FILTERPREPROCESS) {
UniformIntRNG rng{-50, 50};
float epsilon = 0.001;
#define cb(name) \
check_conv_bias_preprocess( \
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \
handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, \
dtype::Int16{}, dtype::Int16{}, name); \
check_conv_bias_preprocess(get_conv_bias_args({1}, 2, false, true, true), \
handle(), &rng, epsilon, dtype::Int8{}, \
dtype::Int8{}, dtype::Int16{}, dtype::Int16{}, \
name);
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_INT8X8X16_K8X8X8");
cb("IM2COLMATMUL:AARCH64_INT8X8X16_K4X4X16");
#elif MEGDNN_ARMV7
cb("IM2COLMATMUL:ARMV7_INT8X8X16_K4X8X8");
cb("IM2COLMATMUL:ARMV7_INT8X8X16_K4X2X16");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16_NOPACK_FILTERPREPROCESS) {
UniformIntRNG rng{-50, 50};
float epsilon = 0.001;
#define cb(name) \
check_conv_bias_preprocess( \
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \
handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, \
dtype::Int16{}, dtype::Int16{}, name); \
check_conv_bias_preprocess(get_conv_bias_args({1}, 2, false, true, true), \
handle(), &rng, epsilon, dtype::Int8{}, \
dtype::Int8{}, dtype::Int16{}, dtype::Int16{}, \
name);
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:ARM_COMMON_INT8X8X16");
#elif MEGDNN_ARMV7
cb("IM2COLMATMUL:ARM_COMMON_INT8X8X16");
#endif
#undef cb
}
#endif
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
......@@ -2147,6 +2394,31 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP16) {
dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, \
name);
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_F16_K8X24X1");
#elif MEGDNN_ARMV7
cb("IM2COLMATMUL:AARCH32_F16_K4X16X1");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP16_FILTERPREPROCESS) {
using namespace conv_bias;
param::ConvBias cur_param;
std::vector<conv_bias::TestArg> args =
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false);
std::vector<conv_bias::TestArg> args1 =
get_conv_bias_args({1}, 2, false, false, false);
args.insert(args.begin(), args1.begin(), args1.end());
NormalRNG rng(1);
#define cb(name) \
check_conv_bias_preprocess(args, handle(), &rng, 0.03, dtype::Float16{}, \
dtype::Float16{}, dtype::Float16{}, \
dtype::Float16{}, name);
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_F16_K8X24X1");
#elif MEGDNN_ARMV7
......@@ -2185,6 +2457,36 @@ void checker_conv_bias_mul_int8x8x32(std::vector<conv_bias::TestArg> args,
}
}
void checker_conv_bias_int8x8x32_preprocess(std::vector<conv_bias::TestArg> args,
Handle* handle, const char* algo_name) {
using namespace conv_bias;
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle);
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
checker.set_dtype(0, dtype::Int8());
checker.set_dtype(1, dtype::Int8());
checker.set_dtype(2, dtype::Int32());
checker.set_dtype(4, dtype::Int32());
for (auto&& arg : args) {
checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}});
}
UniformIntRNG rng{-50, 50};
for (auto&& arg : args) {
checker.set_dtype(0, dtype::QuantizedS8(2.5f))
.set_dtype(1, dtype::QuantizedS8(2.5f))
.set_dtype(2, dtype::QuantizedS32(6.25f))
.set_dtype(4, {})
.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &rng)
.set_param(arg.param)
.execs({arg.src, arg.filter, {}, {}, {}});
}
}
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
#if !__ARM_FEATURE_DOTPROD
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2) {
......@@ -2201,6 +2503,20 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2) {
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2_PREPROCESS) {
using namespace conv_bias;
std::vector<conv_bias::TestArg> args =
get_nchw44_conv_bias_args({2, 5, 7}, 2, false, true, true);
#define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name);
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96");
#else
cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1) {
using namespace conv_bias;
std::vector<conv_bias::TestArg> args =
......@@ -2216,6 +2532,21 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1) {
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1_PREPROCESS) {
using namespace conv_bias;
std::vector<conv_bias::TestArg> args =
get_nchw44_conv_bias_args({3, 4, 6}, 1, false, true, true);
#define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name);
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96");
#else
cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S2) {
UniformIntRNG rng{-50, 50};
......@@ -2234,6 +2565,25 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S2_PREPROCESS) {
UniformIntRNG rng{-50, 50};
#define cb(name) \
check_conv_bias_preprocess( \
get_nchw44_conv_bias_args({3, 4, 6}, 2), handle(), &rng, epsilon, \
dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), \
dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name);
float epsilon = 0.001;
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96");
#else
cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S1) {
UniformIntRNG rng{-50, 50};
......@@ -2252,6 +2602,24 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S1_PREPROCESS) {
UniformIntRNG rng{-50, 50};
#define cb(name) \
check_conv_bias_preprocess( \
get_nchw44_conv_bias_args({2, 5, 7}, 1), handle(), &rng, epsilon, \
dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), \
dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name);
float epsilon = 0.001;
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96");
#else
cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96");
#endif
#undef cb
}
#if MEGDNN_AARCH64
TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_FUSE) {
......@@ -2266,6 +2634,21 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96");
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_FUSE_PREPROCESS) {
UniformIntRNG rng{-50, 50};
#define cb(name) \
check_conv_bias_preprocess( \
get_nchw44_conv_bias_args({3}, 1), handle(), &rng, epsilon, \
dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), \
dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name);
float epsilon = 0.001;
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96");
#undef cb
}
#endif
#endif
#endif
......@@ -2287,6 +2670,23 @@ TEST_F(ARM_COMMON_MULTI_THREADS,
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96");
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS,
CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44DOT_FUSE_PREPROCESS) {
UniformIntRNG rng{-50, 50};
#define cb(name) \
check_conv_bias_preprocess( \
get_nchw44_conv_bias_args({3}, 1, false, false, false, false, \
true, false, false, false), \
handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \
dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \
dtype::QuantizedS8(60.25f), name);
float epsilon = 0.001;
cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96");
#undef cb
}
#endif
#endif
......@@ -2320,6 +2720,36 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) {
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32_FILTER_PREPROCESS) {
using namespace conv_bias;
std::vector<conv_bias::TestArg> args =
get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true);
std::vector<conv_bias::TestArg> args1 =
get_conv_bias_args({1}, 2, false, true, true);
args.insert(args.begin(), args1.begin(), args1.end());
#define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name);
#if MEGDNN_AARCH64
#if __ARM_FEATURE_DOTPROD
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X12X4_DOTPROD");
#else
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X8X8");
cb("IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16");
#endif
#elif MEGDNN_ARMV7
#if __ARM_FEATURE_DOTPROD
cb("IM2COLMATMUL:AARCH32_INT8_K6X8X4");
#endif
cb("IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8");
#endif
#if MEGDNN_ARMV7
cb("IM2COLMATMUL:ARMV7_INT8X8X32_K4X2X16");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32) {
using namespace conv_bias;
std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args(
......@@ -2331,25 +2761,62 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32) {
#endif
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32_PREPROCESS) {
using namespace conv_bias;
std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args(
{2, 4, 7}, 1, false, false, false, false, false, true, true);
#define cb(name) \
check_conv_bias_preprocess(args, handle(), nullptr, 0.001, \
dtype::Float32(), dtype::Float32(), \
dtype::Float32(), dtype::Float32(), name);
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1");
#elif MEGDNN_ARMV7
cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32) {
using namespace conv_bias;
std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args(
{3, 5, 6}, 2, false, false, false, false, false, true, true);
#define cb(name) check_conv_bias(args, handle(), name);
#if MEGDNN_AARCH64
check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1");
cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1");
#elif MEGDNN_ARMV7
check_conv_bias(args, handle(), "IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12");
cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32_FUSE_PREPROCESS) {
using namespace conv_bias;
std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args(
{3}, 2, false, false, false, false, false, true, true, false);
#define cb(name) \
check_conv_bias_preprocess(args, handle(), nullptr, 0.001, \
dtype::Float32(), dtype::Float32(), \
dtype::Float32(), dtype::Float32(), name);
#if MEGDNN_AARCH64
cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1");
#elif MEGDNN_ARMV7
cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12");
#endif
#undef cb
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32_FUSE) {
using namespace conv_bias;
std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args(
{3}, 2, false, false, false, false, false, true, true, false);
#define cb(name) check_conv_bias(args, handle(), name);
#if MEGDNN_AARCH64
check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1");
cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1");
#elif MEGDNN_ARMV7
check_conv_bias(args, handle(), "IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12");
cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12");
#endif
#undef cb
}
/***************************** Conv1x1 Algo Test ***********************/
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_F32) {
......
......@@ -1118,6 +1118,30 @@ void checker_conv_bias_int8x8x16(std::vector<conv_bias::TestArg> args,
}
}
void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args,
Handle* handle, RNG* rng, float epsilon,
DType type0, DType type1, DType type2,
DType type3, const char* algo_name) {
using namespace conv_bias;
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle);
checker.set_dtype(0, type0);
checker.set_dtype(1, type1);
checker.set_dtype(2, type2);
checker.set_dtype(4, type3);
checker.set_epsilon(epsilon);
if (NULL != rng) {
checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
}
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
for (auto&& arg : args) {
checker.set_param(arg.param).execs(
{arg.src, arg.filter, arg.bias, {}, {}});
}
}
void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
param::ConvBias param, Handle* handle,
......
......@@ -58,7 +58,10 @@ std::vector<TestArg> get_int8_chwn4_tensorcore_args(size_t kernel_size);
std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size,
bool compute_float32 = false,
bool group_mode = false);
void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args,
Handle* handle, RNG* rng, float epsilon,
DType type0, DType type1, DType type2,
DType type3, const char* algo_name);
template <typename Opr>
using ConvBiasAlgoChecker = AlgoChecker<Opr>;
......
......@@ -752,7 +752,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE2) {
}
}
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X) {
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32) {
using namespace conv_bias;
std::vector<TestArg> args;
......@@ -842,6 +842,98 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X) {
#undef cb2
}
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32_FILTER_PREPROCESS) {
using namespace conv_bias;
std::vector<TestArg> args;
auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
size_t p, NonlineMode nonline_mode) {
        //! skip shapes where the padded input is smaller than the kernel
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
param::ConvBias param;
param.stride_h = 1;
param.stride_w = 1;
param.pad_h = p;
param.pad_w = p;
param.nonlineMode = nonline_mode;
//! no bias
args.emplace_back(param, TensorShape{1, ic, h, w},
TensorShape{oc, ic, kernel, kernel}, TensorShape{});
};
for (size_t kernel : {2, 3, 4, 5, 6, 7})
for (size_t ic : {1, 4, 8, 16})
for (size_t oc : {1, 4, 8})
for (size_t p : {0, 2})
for (size_t size : {20, 21, 24})
for (NonlineMode nonline_mode :
{NonlineMode::IDENTITY}) {
run(oc, ic, size, size, kernel, p, nonline_mode);
}
//! test OC block
run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY);
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
UniformIntRNG rng{-50, 50};
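//! cb() checks an algo twice: first with plain Int8 in / Int32 out dtypes,
//! then with quantized dtypes (QuantizedS8 src/filter, QuantizedS32 bias,
//! dst dtype inferred); cb2() covers the Int8 in / Int16 out variants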
#define cb(algo_name) \
checker.set_before_exec_callback( \
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
checker.set_dtype(0, dtype::Int8()); \
checker.set_dtype(1, dtype::Int8()); \
checker.set_dtype(2, dtype::Int32()); \
checker.set_dtype(4, dtype::Int32()); \
for (auto&& arg : args) { \
checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
} \
for (auto&& arg : args) { \
checker.set_dtype(0, dtype::QuantizedS8(2.5f)) \
.set_dtype(1, dtype::QuantizedS8(2.5f)) \
.set_dtype(2, dtype::QuantizedS32(6.25f)) \
.set_dtype(4, {}) \
.set_rng(0, &rng) \
.set_rng(1, &rng) \
.set_rng(2, &rng) \
.set_param(arg.param) \
.execs({arg.src, arg.filter, {}, {}, {}}); \
}
#define cb2(algo_name) \
checker.set_before_exec_callback( \
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
checker.set_dtype(0, dtype::Int8()); \
checker.set_dtype(1, dtype::Int8()); \
checker.set_dtype(2, dtype::Int16()); \
checker.set_dtype(4, dtype::Int16()); \
for (auto&& arg : args) { \
checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
}
#if MEGDNN_X86_WITH_MKL_DNN
if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
}
#endif
#if MEGDNN_X86_WITH_VNNI
if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
}
#endif
if (megdnn::x86::is_supported(x86::SIMDType::AVX2)) {
cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2");
cb2("IM2COLMATMUL:X86_INT8X8X16_AVX2");
}
    if (megdnn::x86::is_supported(x86::SIMDType::SSE4_2)) {
cb("IM2COLMATMUL:X86_INT8X8X32_SSE_4X8X2");
cb2("IM2COLMATMUL:X86_INT8X8X16_SSE");
}
#undef cb
#undef cb2
}
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) {
using namespace conv_bias;
std::vector<TestArg> args;
......@@ -950,6 +1042,61 @@ TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32) {
#undef cb
}
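//! FP32 im2col through the BLAS matmul (no matmul packing), run through the
//! weight-preprocess path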
TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32_NOPACK_PREPROCESS) {
using namespace conv_bias;
std::vector<TestArg> args;
auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
size_t p, NonlineMode nonline_mode) {
if (w + 2 * p < kernel || h + 2 * p < kernel)
return;
param::ConvBias param;
param.stride_h = 1;
param.stride_w = 1;
param.pad_h = p;
param.pad_w = p;
param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! bias with the same shape as dst
        args.emplace_back(
                param, TensorShape{1, ic, h, w},
                TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
                            (w + 2 * p - kernel) / param.stride_w + 1});
};
for (size_t kernel : {2, 3, 4, 5, 6, 7})
for (size_t ic : {1, 4, 8, 16})
for (size_t oc : {1, 4, 8, 16, 300})
for (size_t p : {0, 2})
for (size_t size : {8, 24})
for (NonlineMode nonline_mode :
{NonlineMode::IDENTITY, NonlineMode::RELU}) {
run(oc, ic, size, size, kernel, p, nonline_mode);
}
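    //! test OC block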
run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
#define cb(algo_name) \
checker.set_before_exec_callback( \
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
for (auto&& arg : args) { \
checker.set_param(arg.param).execs( \
{arg.src, arg.filter, arg.bias, {}, {}}); \
}
cb("IM2COLMATMUL:X86_F32_BLAS");
#undef cb
}
#endif
......@@ -1020,6 +1167,73 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
#undef cb
}
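//! FP32 im2col with packed matmul A (MKL PACKA) through the
//! weight-preprocess path, including GROUP sparse cases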
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA_FILTER_PREPROCESS) {
using namespace conv_bias;
std::vector<TestArg> args;
auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
size_t p, NonlineMode nonline_mode) {
if (w + 2 * p < kernel || h + 2 * p < kernel)
return;
param::ConvBias param;
param.stride_h = 1;
param.stride_w = 1;
param.pad_h = p;
param.pad_w = p;
param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! bias with the same shape as dst
        args.emplace_back(
                param, TensorShape{1, ic, h, w},
                TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
                            (w + 2 * p - kernel) / param.stride_w + 1});
        //! group convolution: two groups, 5-D filter {group, oc, ic, fh, fw}
        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
                          TensorShape{2, oc, ic, kernel, kernel},
                          TensorShape{});
        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
                          TensorShape{2, oc, ic, kernel, kernel},
                          TensorShape{1, oc * 2, 1, 1});
        args.emplace_back(
                param, TensorShape{1, 2 * ic, h, w},
                TensorShape{2, oc, ic, kernel, kernel},
                TensorShape{1, 2 * oc, (h + 2 * param.pad_h - kernel) / 1 + 1,
                            (w + 2 * param.pad_w - kernel) / 1 + 1});
};
for (size_t kernel : {2, 3, 4, 5, 6, 7})
for (size_t ic : {1, 4, 8, 16})
for (size_t oc : {1, 4, 8, 16})
for (size_t p : {0, 1})
for (size_t size : {8, 24})
for (NonlineMode nonline_mode :
{NonlineMode::IDENTITY, NonlineMode::RELU}) {
run(oc, ic, size, size, kernel, p, nonline_mode);
}
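    //! test OC block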
run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
#define cb(algo_name) \
checker.set_before_exec_callback( \
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
for (auto&& arg : args) { \
checker.set_param(arg.param).execs( \
{arg.src, arg.filter, arg.bias, {}, {}}); \
}
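    //! the trailing ":192" selects the im2col ohw tile size (assumption: the
    //! im2col algo name is parsed as IM2COLMATMUL:<matmul_algo>:<tile_size>)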
cb("IM2COLMATMUL:X86_F32_MKL_PACKA:192");
#undef cb
}
/**************************** Conv1x1 PackA *************************/
namespace {
void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
......@@ -1169,6 +1383,77 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
#undef cb
}
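//! quantized int8 conv_bias through the weight-preprocess proxy; dst dtype
//! is QuantizedS8, so results are requantized after Int32 accumulation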
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8_FILTER_PREPROCESS) {
using namespace conv_bias;
std::vector<TestArg> args;
auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
size_t p, NonlineMode nonline_mode) {
if (w + 2 * p < kernel || h + 2 * p < kernel)
return;
param::ConvBias param;
param.stride_h = 1;
param.stride_w = 1;
param.pad_h = p;
param.pad_w = p;
param.nonlineMode = nonline_mode;
//! no bias
args.emplace_back(param, TensorShape{1, ic, h, w},
TensorShape{oc, ic, kernel, kernel}, TensorShape{});
//! bias channel
args.emplace_back(param, TensorShape{2, ic, h, w},
TensorShape{oc, ic, kernel, kernel},
TensorShape{1, oc, 1, 1});
};
for (size_t kernel : {2, 3, 4, 5, 6, 7})
for (size_t ic : {1, 4, 8, 16})
for (size_t oc : {1, 4, 8})
for (size_t p : {0, 2})
for (size_t size : {20, 21, 24})
for (NonlineMode nonline_mode :
{NonlineMode::IDENTITY, NonlineMode::RELU,
NonlineMode::H_SWISH}) {
run(oc, ic, size, size, kernel, p, nonline_mode);
}
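    //! test OC block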
run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
#define cb(algo_name) \
checker.set_before_exec_callback( \
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
UniformIntRNG rng{-50, 50}; \
for (auto&& arg : args) { \
checker.set_dtype(0, dtype::QuantizedS8(2.5f)) \
.set_dtype(1, dtype::QuantizedS8(2.5f)) \
.set_dtype(2, dtype::QuantizedS32(6.25f)) \
                .set_dtype(4, dtype::QuantizedS8(60.25f))                     \
.set_rng(0, &rng) \
.set_rng(1, &rng) \
.set_rng(2, &rng) \
.set_param(arg.param) \
.execs({arg.src, arg.filter, {}, {}, {}}); \
}
#if MEGDNN_X86_WITH_MKL_DNN
if (x86::is_supported(x86::SIMDType::VNNI)) {
cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
}
#endif
#if MEGDNN_X86_WITH_VNNI
if (x86::is_supported(x86::SIMDType::VNNI)) {
cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
}
#endif
if (x86::is_supported(x86::SIMDType::AVX2)) {
cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
}
#undef cb
}
TEST_F(X86, CONV_BIAS_MATMUL) {
using namespace conv_bias;
std::vector<TestArg> args;
......