提交 c4dfdbd2 编写于 作者: M Megvii Engine Team 提交者: Xinran Xu

refactor(dnn/fallback): refactor im2col

GitOrigin-RevId: b58770211e33c68267b2ed32811872bb7ee0696c
上级 86a3445e
......@@ -67,8 +67,7 @@ public:
}
auto&& fm = param.filter_meta;
auto OC = fm.ocpg, IC = fm.icpg;
return (fm.spatial[0] == fm.spatial[1] && fm.spatial[0] == 1) ||
OC >= 32 || IC >= 32;
return OC >= 32 || IC >= 32;
}
private:
......
此差异已折叠。
/**
* \file dnn/src/fallback/conv_bias/im2col/strategy_base.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "src/fallback/conv_bias/opr_impl.h"
namespace megdnn {
using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode;
//! Per-invocation state that the im2col conv-bias algorithm threads through
//! its kernel stages (padding copy, im2col, matmul, postprocess).
//! Plain data; the owning algorithm fills every field before dispatch.
struct StrategyParam {
size_t batch_id;              //!< batch index currently being processed
size_t group_id;              //!< convolution group index
size_t oc_tile_size;          //!< output channels per OC tile (packA granularity)
size_t oc_cur_index;          //!< first output channel of the current tile
size_t oc_end_index;          //!< one past the last output channel of the tile
size_t ohw_cur_index;         //!< offset into the flattened OH*OW output plane
size_t output_block_size;     //!< number of output pixels in this block (matmul N)
size_t output_block_oc_size;  //!< number of output channels in this block (matmul M)
size_t ohw;                   //!< total output spatial size, OH * OW
size_t block_m;               //!< matmul blocking size M -- not used in this file; presumably set for the caller
size_t block_n;               //!< matmul blocking size N -- not used in this file
size_t block_k;               //!< matmul blocking size K -- not used in this file
bool skip_copy_dst;           //!< when true, copy_dst() is a no-op (matmul wrote dst directly)
bool is_dst_8bit;             //!< dst dtype is 8-bit: matmul must write to a temp buffer
bool is_ohw_size_bigger;      //!< together with !is_dst_8bit, allows matmul to write straight into dst
};
//! Abstract interface implemented by each PackMode-specific im2col strategy.
//! The owning conv-bias algorithm drives the stages:
//! copy_padding_kern -> packA_kern -> exec_im2col -> exec_matmul ->
//! exec_postprocess.  Workspace bundles are passed by value and rebound to
//! the current workspace pointer inside each kernel.
class StrategyBase {
public:
StrategyBase() = default;
virtual ~StrategyBase() = default;
//! copy the input image (with zero/zero-point padding) into the workspace
virtual void copy_padding_kern(
WorkspaceBundle bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
//! pre-pack the filter (matrix A) for the matmul backend
virtual void packA_kern(
WorkspaceBundle bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
//! expand the current output block of the input into matrix-B layout
virtual void exec_im2col(
WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo) = 0;
//! run the matrix multiplication for the current output block
virtual void exec_matmul(
const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
//! apply bias / nonlinearity and move the result into the final dst
virtual void exec_postprocess(
const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle_thread) = 0;
};
//! Primary template, intentionally undefined: only the three PackMode
//! specializations below (DEFAULT, NO_PACK, ONLY_PACKA) exist.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode, PackMode packmode>
class Strategy;
//! PackMode::DEFAULT: both the filter (matrix A) and the im2col output
//! (matrix B) are pre-packed via the matmul backend's pack_A/pack_B before
//! the "naked" matmul kernel runs.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
class Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::DEFAULT> : public StrategyBase {
public:
//! indices into the algorithm-level workspace bundle
constexpr static size_t BUNDLE_PADDING_INDEX = 0;
constexpr static size_t BUNDLE_PACKA_INDEX = 1;
//! indices into the per-thread workspace bundle
constexpr static size_t THREAD_BUNDLE_PACKB_INDEX = 0;
constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1;
constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2;
Strategy();
void copy_padding_kern(
WorkspaceBundle bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override;
void packA_kern(
WorkspaceBundle bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override;
void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
void exec_matmul(
const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override;
void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam,
WorkspaceBundle bundle_thread) override;
//! copy a finished block from the matmul temp buffer into the dst tensor
void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param,
const void* matmul_dst, const StrategyParam& sparam);
//! in BIAS mode, stage the bias window for this block into the temp buffer
void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param,
WorkspaceBundle bundle_thread, const StrategyParam& sparam);
//! temp bias buffer pointer, or nullptr unless bias_mode == BIAS
void* get_bias_temp_ptr(const fallback::ConvBiasImpl::NCBKernParam& param,
const WorkspaceBundle& bundle_thread);
//! where matmul should write: thread workspace or directly into dst
void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam);
};
//! PackMode::NO_PACK: the matmul backend consumes the raw filter and the raw
//! im2col output; packA_kern must never be invoked for this mode.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
class Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::NO_PACK> : public StrategyBase {
public:
//! indices into the algorithm-level workspace bundle
constexpr static size_t BUNDLE_PADDING_INDEX = 0;
constexpr static size_t BUNDLE_PACKA_INDEX = 1;
//! indices into the per-thread workspace bundle
constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 0;
constexpr static size_t THREAD_BUNDLE_MATMULDST_INDEX = 1;
constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2;
constexpr static size_t THREAD_BUNDLE_MATCOMP_INDEX = 3;
Strategy();
void copy_padding_kern(
WorkspaceBundle bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override;
//! not supported in NO_PACK mode; the implementation throws
void packA_kern(
WorkspaceBundle bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override;
void exec_matmul(
const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override;
//! where matmul should write: thread workspace or directly into dst
void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam);
//! temp bias buffer pointer, or nullptr unless bias_mode == BIAS
inline void* get_bias_temp_ptr(
const fallback::ConvBiasImpl::NCBKernParam& param,
const WorkspaceBundle& bundle_thread) {
bias_ctype* bias_tmp_ptr =
param.bias_mode == megdnn::BiasMode::BIAS
? static_cast<bias_ctype*>(
bundle_thread.get(THREAD_BUNDLE_BIAS_INDEX))
: nullptr;
return bias_tmp_ptr;
}
void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam,
WorkspaceBundle bundle_thread) override;
//! copy a finished block from the matmul temp buffer into the dst tensor
void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param,
const void* matmul_dst, const StrategyParam& sparam);
//! in BIAS mode, stage the bias window for this block into the temp buffer
void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param,
WorkspaceBundle bundle_thread, const StrategyParam& sparam);
};
//! PackMode::ONLY_PACKA: only the filter (matrix A) is pre-packed; the
//! im2col output is consumed unpacked (a PACKB slot still exists in the
//! thread bundle alongside a separate matmul-dst slot).
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
class Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::ONLY_PACKA> : public StrategyBase {
public:
//! indices into the algorithm-level workspace bundle
constexpr static size_t BUNDLE_PADDING_INDEX = 0;
constexpr static size_t BUNDLE_PACKA_INDEX = 1;
//! indices into the per-thread workspace bundle
constexpr static size_t THREAD_BUNDLE_PACKB_INDEX = 0;
constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1;
constexpr static size_t THREAD_BUNDLE_MATMULDST_INDEX = 2;
constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 3;
Strategy();
void copy_padding_kern(
WorkspaceBundle bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override;
void packA_kern(
WorkspaceBundle bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override;
void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
void exec_matmul(
const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override;
//! where matmul should write: thread workspace or directly into dst
void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam);
//! temp bias buffer pointer, or nullptr unless bias_mode == BIAS
inline void* get_bias_temp_ptr(
const fallback::ConvBiasImpl::NCBKernParam& param,
const WorkspaceBundle& bundle_thread) {
bias_ctype* bias_tmp_ptr =
param.bias_mode == megdnn::BiasMode::BIAS
? static_cast<bias_ctype*>(
bundle_thread.get(THREAD_BUNDLE_BIAS_INDEX))
: nullptr;
return bias_tmp_ptr;
}
void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam,
WorkspaceBundle bundle_thread) override;
//! copy a finished block from the matmul temp buffer into the dst tensor
void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param,
const void* matmul_dst, const StrategyParam& sparam);
//! in BIAS mode, stage the bias window for this block into the temp buffer
void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param,
WorkspaceBundle bundle_thread, const StrategyParam& sparam);
};
} // namespace megdnn
/**
* \file dnn/src/fallback/conv_bias/im2col/strategy_default.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "megdnn/opr_param_defs.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/im2col/strategy_base.h"
#include "src/fallback/convolution/img2col_helper.h"
#if MEGDNN_X86
#include "src/x86/conv_bias/postprocess_helper.h"
#endif
using namespace megdnn;
#if MEGDNN_X86
using namespace x86;
#endif
namespace megdnn {
//! The DEFAULT-pack strategy holds no per-instance state, so the
//! compiler-generated constructor is sufficient.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
         postprocess_mode, PackMode::DEFAULT>::Strategy() = default;
//! Copy one input channel into the zero/zero-point padded image in the
//! padding workspace.  One invocation handles a single
//! (batch, group, channel) triple taken from ncb_index.ndrange_id.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode,PackMode::DEFAULT>::
copy_padding_kern(
WorkspaceBundle bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
UNPACK_CONV_F32_NCB_KERN_SIZES(param);
MEGDNN_MARK_USED_VAR(N);
MEGDNN_MARK_USED_VAR(OC);
MEGDNN_MARK_USED_VAR(OH);
MEGDNN_MARK_USED_VAR(OW);
MEGDNN_MARK_USED_VAR(FH);
MEGDNN_MARK_USED_VAR(FW);
MEGDNN_MARK_USED_VAR(SH);
MEGDNN_MARK_USED_VAR(SW);
// padded input extents
size_t IW2 = IW + 2 * PW;
size_t IH2 = IH + 2 * PH;
size_t batch_id = ncb_index.ndrange_id[0];
size_t group_id = ncb_index.ndrange_id[1];
size_t channel_id = ncb_index.ndrange_id[2];
// offsets (in elements) of this channel's padded image in the workspace
size_t padding_group_size = IH2 * IW2 * IC;
size_t workspace_channel_offset = IH2 * IW2 * channel_id;
size_t workspace_group_offset = group_id * padding_group_size;
size_t workspace_batch_offset =
param.filter_meta.group * batch_id * padding_group_size;
bundle.set(param.workspace_ptr);
// Quantized8Asymm pads with its zero point; every other dtype pads with 0.
src_ctype src_zp = static_cast<src_ctype>(0);
if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) {
src_zp = param.src_type.param<dtype::Quantized8Asymm>().zero_point;
}
src_ctype* src = const_cast<src_ctype*>(
param.src<src_ctype>(batch_id, group_id, channel_id));
src_ctype* src2;
src2 = static_cast<src_ctype*>(bundle.get(BUNDLE_PADDING_INDEX)) +
workspace_group_offset + workspace_batch_offset +
workspace_channel_offset;
src_ctype* src2_ptr = src2;
const src_ctype* src_ptr = src;
// NOTE(review): memset fills bytes, so a non-zero src_zp is only correct
// for 1-byte src_ctype (the Quantized8Asymm case above) -- confirm no
// wider asymmetric dtype can reach this path.
if (PH != 0) {
std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
src2_ptr += PH * IW2;
}
// copy each input row, writing PW padding pixels on both sides
rep(ih, IH) {
if (PW != 0)
rep(pw, PW) * (src2_ptr++) = src_zp;
std::memcpy(src2_ptr, src_ptr, sizeof(src_ctype) * IW);
src2_ptr += IW;
src_ptr += IW;
if (PW != 0)
rep(pw, PW) * (src2_ptr++) = src_zp;
}
// bottom padding rows
if (PH != 0) {
std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
src2_ptr += PH * IW2;
}
}
//! Pre-pack one OC block of the filter (matrix A) for one group.
//! ndrange_id[0] is the group, ndrange_id[1] the OC-block index; the packed
//! panel lands in the BUNDLE_PACKA workspace at
//! group * packA_group_size + block * packed_per_oc_block_size.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode,PackMode::DEFAULT>::
packA_kern(WorkspaceBundle bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
bundle.set(param.workspace_ptr);
fallback::MatrixMulImpl::KernParam matmul_param;
size_t group_id = ncb_index.ndrange_id[0];
// KernParam extends KernSizeParam; copy the size portion in
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
matmulparam;
size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0);
// bytes per packed OC block: K rounded up to the backend's inner K block,
// times the backend's inner M block, times the packed element size
size_t packed_per_oc_block_size =
round_up(matmul_param.K, matmul_algo->get_inner_block_size().k) *
matmul_algo->get_inner_block_size().m *
matmul_algo->get_packA_type_size();
size_t a_panel_offset = ncb_index.ndrange_id[1] * packed_per_oc_block_size;
int8_t* a_panel = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) +
group_id * packA_group_size + a_panel_offset;
matmul_param.A_ptr =
const_cast<src_ctype*>(param.filter<src_ctype>(group_id));
matmul_algo->pack_A(matmul_param, a_panel, ncb_index.ndrange_id[1],
matmul_algo->get_inner_block_size().m);
}
//! Expand the current output block into im2col (matrix B) layout, then
//! pre-pack B for the matmul backend into the thread-local PACKB buffer.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode,PackMode::DEFAULT>::
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo
) {
size_t m_sh = param.filter_meta.stride[0];
size_t m_sw = param.filter_meta.stride[1];
size_t m_oc = param.filter_meta.ocpg;
size_t m_oh = param.osz[0];
size_t m_ow = param.osz[1];
size_t m_ic = param.filter_meta.icpg;
// padded input extents (copy_padding_kern already wrote this image)
size_t m_ih = param.isz[0] + param.filter_meta.padding[0] * 2;
size_t m_iw = param.isz[1] + param.filter_meta.padding[1] * 2;
size_t m_fh = param.filter_meta.spatial[0];
size_t m_fw = param.filter_meta.spatial[1];
size_t m_is_xcorr = !param.filter_meta.should_flip;
// byte offset of this (batch, group) image inside the padding workspace
size_t input_offset =
m_ih * m_iw * m_ic *
(sparam.group_id + param.filter_meta.group * sparam.batch_id) *
sizeof(src_ctype);
src_ctype* src2 = reinterpret_cast<src_ctype*>(
reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PADDING_INDEX)) +
input_offset);
bool is_phpwzero = param.filter_meta.padding[0] == 0 &&
param.filter_meta.padding[1] == 0;
if (is_phpwzero) {
// no padding needed: read straight from the original input tensor
src2 = const_cast<src_ctype*>(
param.src<src_ctype>(sparam.batch_id, sparam.group_id));
}
src_ctype* im2col_dst = static_cast<src_ctype*>(
bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX));
// stride-1 uses a specialised kernel; the template flag selects
// cross-correlation (no filter flip) vs true convolution
if (m_sh == 1 && m_sw == 1) {
if (m_is_xcorr) {
img2col<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
m_fh, m_fw, sparam.ohw_cur_index,
sparam.output_block_size);
} else {
img2col<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
m_fh, m_fw, sparam.ohw_cur_index,
sparam.output_block_size);
}
} else {
if (m_is_xcorr) {
img2col_stride<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih,
m_iw, m_fh, m_fw, m_sh, m_sw,
sparam.ohw_cur_index,
sparam.output_block_size);
} else {
img2col_stride<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic,
m_ih, m_iw, m_fh, m_fw, m_sh, m_sw,
sparam.ohw_cur_index,
sparam.output_block_size);
}
}
// set the block's matmul geometry and pre-pack B
matmul_param.M = sparam.output_block_oc_size;
matmul_param.N = sparam.output_block_size;
matmul_param.LDB = sparam.output_block_size;
matmul_param.LDC = sparam.output_block_size;
matmul_param.B_ptr = im2col_dst;
src_ctype* b_panel =
reinterpret_cast<src_ctype*>(reinterpret_cast<uintptr_t>(
bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX)));
matmul_algo->pack_B(matmul_param, b_panel, 0, matmul_param.N);
}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
void* Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode,PackMode::DEFAULT>::
get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam) {
if (sparam.is_dst_8bit || !sparam.is_ohw_size_bigger) {
return static_cast<void*>(
bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX));
} else {
bias_ctype* dst =
param.dst<bias_ctype>(sparam.batch_id, sparam.group_id) +
sparam.oc_cur_index * sparam.ohw;
return static_cast<void*>(dst);
}
}
//! Run the naked matmul kernel: packed A panel (from packA_kern) times the
//! packed B panel (from exec_im2col), writing into the destination chosen
//! by get_matmul_dst_ptr.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode,PackMode::DEFAULT>::
exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
// bytes per packed OC tile (K rounded up to the backend's inner K block)
size_t packA_per_oc_block_size =
round_up(matmul_param.K, matmul_algo->get_inner_block_size().k) *
sparam.oc_tile_size * matmul_algo->get_packA_type_size();
size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0);
// ndrange_id[1] is presumably the group and ndrange_id[3] the OC-tile
// index here (this kernel uses a different ndrange shape from packA_kern)
// -- TODO confirm against the dispatching algorithm.
size_t a_panel_offset = ncb_index.ndrange_id[1] * packA_group_size +
ncb_index.ndrange_id[3] * packA_per_oc_block_size;
void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);
src_ctype* a_panel = reinterpret_cast<src_ctype*>(
reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) +
a_panel_offset);
src_ctype* b_panel =
reinterpret_cast<src_ctype*>(reinterpret_cast<uintptr_t>(
bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX)));
// per-block matmul geometry (same values exec_im2col used to pack B)
matmul_param.M = sparam.output_block_oc_size;
matmul_param.N = sparam.output_block_size;
matmul_param.LDB = sparam.output_block_size;
matmul_param.LDC = sparam.output_block_size;
matmul_param.C_ptr = matmul_dst;
// "naked" kernel: consumes pre-packed panels, no packing of its own
auto matmul_kern_naked = matmul_algo->get_kern_naked(matmul_param);
matmul_kern_naked(matmul_param, a_panel, b_panel);
}
//! Apply bias and nonlinearity in place on the matmul result, then copy it
//! into the final dst tensor if the matmul wrote to a temp buffer.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode,PackMode::DEFAULT>::
exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam,
WorkspaceBundle bundle_thread) {
// stage the bias window for this block (no-op unless bias_mode == BIAS)
copy_bias(param, bundle_thread, sparam);
void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);
const bias_ctype* bias_ptr = static_cast<const bias_ctype*>(
param.bias<bias_ctype>(sparam.batch_id, sparam.group_id));
void* bias_temp_ptr = get_bias_temp_ptr(param, bundle_thread);
// full-tensor BIAS uses the staged copy; other modes index the raw bias
// by the block's first output channel
void* bias_preprocess_ptr = const_cast<void*>(
param.bias_mode == megdnn::BiasMode::BIAS
? bias_temp_ptr
: static_cast<void*>(const_cast<bias_ctype*>(
bias_ptr + sparam.oc_cur_index)));
// in-place postprocess over an output_block_oc_size x output_block_size
// matrix (batch and group dims collapsed to 1)
PostProcess<op_ctype, op_dtype, postprocess_mode>::run(
matmul_dst, bias_preprocess_ptr, matmul_dst, param.bias_mode,
param.nonlineMode, param.bias_type, param.dst_type, 1_z,
sparam.output_block_oc_size, 1_z, sparam.output_block_size);
copy_dst(param, matmul_dst, sparam);
}
//! Copy the post-processed block from the matmul buffer into the dst
//! tensor, one output-channel row at a time.  A no-op when the matmul
//! already wrote directly into dst (sparam.skip_copy_dst).
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::DEFAULT>::
        copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param,
                 const void* matmul_dst, const StrategyParam& sparam) {
    if (sparam.skip_copy_dst)
        return;
    const dst_ctype* src_row = static_cast<const dst_ctype*>(matmul_dst);
    // dst rows are ohw elements apart; this block starts at
    // (oc_cur_index, ohw_cur_index)
    dst_ctype* dst_row =
            param.dst<dst_ctype>(sparam.batch_id, sparam.group_id) +
            sparam.oc_cur_index * sparam.ohw + sparam.ohw_cur_index;
    for (size_t row = 0; row < sparam.output_block_oc_size; ++row) {
        std::memcpy(dst_row, src_row,
                    sizeof(dst_ctype) * sparam.output_block_size);
        src_row += sparam.output_block_size;
        dst_row += sparam.ohw;
    }
}
//! Return the thread-local temporary bias buffer, or nullptr when the
//! bias mode does not need one (only full-tensor BIAS is staged).
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void* Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
               postprocess_mode, PackMode::DEFAULT>::
        get_bias_temp_ptr(const fallback::ConvBiasImpl::NCBKernParam& param,
                          const WorkspaceBundle& bundle_thread) {
    if (param.bias_mode != megdnn::BiasMode::BIAS) {
        return nullptr;
    }
    return static_cast<bias_ctype*>(
            bundle_thread.get(THREAD_BUNDLE_BIAS_INDEX));
}
//! In full-tensor BIAS mode, copy this block's bias window into the
//! thread-local temp buffer so PostProcess can read it contiguously.
//! The pointer arithmetic assumes the bias tensor is laid out like dst:
//! one ohw-element plane per output channel.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode,PackMode::DEFAULT>::
copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param,
WorkspaceBundle bundle_thread, const StrategyParam& sparam) {
const bias_ctype* bias_ptr = static_cast<const bias_ctype*>(
param.bias<bias_ctype>(sparam.batch_id, sparam.group_id));
bias_ctype* bias_temp_ptr =
static_cast<bias_ctype*>(get_bias_temp_ptr(param, bundle_thread));
if (param.bias_mode == megdnn::BiasMode::BIAS) {
bias_ctype* copy_dst = bias_temp_ptr;
const bias_ctype* copy_src = bias_ptr +
sparam.oc_cur_index * sparam.ohw +
sparam.ohw_cur_index;
// one output_block_size-wide row per output channel of this block
for (size_t oc = sparam.oc_cur_index; oc < sparam.oc_end_index; oc++) {
std::memcpy(copy_dst, copy_src,
sizeof(bias_ctype) * sparam.output_block_size);
copy_dst += sparam.output_block_size;
copy_src += sparam.ohw;
}
}
}
//! Explicitly instantiate Strategy<..., PackMode::DEFAULT> for every
//! (src, bias, dst, op) type combination the fallback backend dispatches.
#define INSTANTIAL_CLASS(_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \
_op_dtype, _postprocess_mode) \
template class Strategy<_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \
_op_dtype, _postprocess_mode, PackMode::DEFAULT>;
INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32,
megdnn::PostprocessMode::FLOAT)
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
//! native fp16 arithmetic available: run the postprocess in fp16
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16,
megdnn::PostprocessMode::FLOAT)
#else
#if !MEGDNN_DISABLE_FLOAT16
//! fp16 storage only (no native arithmetic): skip the float postprocess
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
megdnn::PostprocessMode::NO_PROCESS)
#endif
#endif
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
//! x86 has no uint8 matmul kernel, so only armv7/armv8 support uint8
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8,
megdnn::PostprocessMode::QUANTIZED)
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_int32, dt_qint32, dt_qint32,
megdnn::PostprocessMode::NO_PROCESS)
#endif
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int8, dt_qint32, dt_qint8,
megdnn::PostprocessMode::QUANTIZED)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32,
megdnn::PostprocessMode::NO_PROCESS)
INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16,
megdnn::PostprocessMode::NO_PROCESS)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_qint32, dt_qint32,
megdnn::PostprocessMode::NO_PROCESS)
#undef INSTANTIAL_CLASS
} // namespace megdnn
/**
* \file dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "megdnn/opr_param_defs.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/im2col/strategy_base.h"
#include "src/fallback/convolution/img2col_helper.h"
#if MEGDNN_X86
#include "src/x86/conv_bias/postprocess_helper.h"
#endif
using namespace megdnn;
#if MEGDNN_X86
using namespace x86;
#endif
namespace megdnn {
//! The NO_PACK strategy holds no per-instance state, so the
//! compiler-generated constructor is sufficient.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
         postprocess_mode, PackMode::NO_PACK>::Strategy() = default;
//! Copy one input channel into the zero/zero-point padded image in the
//! padding workspace (one (batch, group, channel) triple per invocation).
//! NOTE(review): this body duplicates the PackMode::DEFAULT implementation
//! above verbatim -- a shared helper would remove the duplication.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode,PackMode::NO_PACK>::
copy_padding_kern(
WorkspaceBundle bundle,
const fallback::ConvBiasImpl::NCBKernParam& param,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
UNPACK_CONV_F32_NCB_KERN_SIZES(param);
MEGDNN_MARK_USED_VAR(N);
MEGDNN_MARK_USED_VAR(OC);
MEGDNN_MARK_USED_VAR(OH);
MEGDNN_MARK_USED_VAR(OW);
MEGDNN_MARK_USED_VAR(FH);
MEGDNN_MARK_USED_VAR(FW);
MEGDNN_MARK_USED_VAR(SH);
MEGDNN_MARK_USED_VAR(SW);
// padded input extents
size_t IW2 = IW + 2 * PW;
size_t IH2 = IH + 2 * PH;
size_t batch_id = ncb_index.ndrange_id[0];
size_t group_id = ncb_index.ndrange_id[1];
size_t channel_id = ncb_index.ndrange_id[2];
// offsets (in elements) of this channel's padded image in the workspace
size_t padding_group_size = IH2 * IW2 * IC;
size_t workspace_channel_offset = IH2 * IW2 * channel_id;
size_t workspace_group_offset = group_id * padding_group_size;
size_t workspace_batch_offset =
param.filter_meta.group * batch_id * padding_group_size;
bundle.set(param.workspace_ptr);
// Quantized8Asymm pads with its zero point; every other dtype pads with 0.
src_ctype src_zp = static_cast<src_ctype>(0);
if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) {
src_zp = param.src_type.param<dtype::Quantized8Asymm>().zero_point;
}
src_ctype* src = const_cast<src_ctype*>(
param.src<src_ctype>(batch_id, group_id, channel_id));
src_ctype* src2;
src2 = static_cast<src_ctype*>(bundle.get(BUNDLE_PADDING_INDEX)) +
workspace_group_offset + workspace_batch_offset +
workspace_channel_offset;
src_ctype* src2_ptr = src2;
const src_ctype* src_ptr = src;
// top padding rows (memset fills bytes; non-zero src_zp is only correct
// for 1-byte src_ctype -- the Quantized8Asymm case above)
if (PH != 0) {
std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
src2_ptr += PH * IW2;
}
// copy each input row, writing PW padding pixels on both sides
rep(ih, IH) {
if (PW != 0)
rep(pw, PW) * (src2_ptr++) = src_zp;
std::memcpy(src2_ptr, src_ptr, sizeof(src_ctype) * IW);
src2_ptr += IW;
src_ptr += IW;
if (PW != 0)
rep(pw, PW) * (src2_ptr++) = src_zp;
}
// bottom padding rows
if (PH != 0) {
std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
src2_ptr += PH * IW2;
}
}
//! PackMode::NO_PACK performs no filter pre-packing; reaching this kernel
//! indicates a dispatch bug in the caller, so it always throws.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::NO_PACK>::
        packA_kern(WorkspaceBundle bundle,
                   const fallback::ConvBiasImpl::NCBKernParam& param,
                   fallback::MatrixMulImpl::KernSizeParam matmulparam,
                   fallback::MatrixMulImpl::AlgoBase* matmul_algo,
                   const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    // silence unused-parameter warnings: nothing here is touched
    MEGDNN_MARK_USED_VAR(ncb_index);
    MEGDNN_MARK_USED_VAR(matmul_algo);
    MEGDNN_MARK_USED_VAR(matmulparam);
    MEGDNN_MARK_USED_VAR(param);
    MEGDNN_MARK_USED_VAR(bundle);
    megdnn_throw(
            "nopack mode should not call packA_kern please check your code");
}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
void* Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode,PackMode::NO_PACK>::
get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam) {
if (sparam.is_dst_8bit || !sparam.is_ohw_size_bigger) {
return static_cast<bias_ctype*>(
bundle_thread.get(THREAD_BUNDLE_MATMULDST_INDEX));
} else {
bias_ctype* dst =
param.dst<bias_ctype>(sparam.batch_id, sparam.group_id) +
sparam.oc_cur_index * sparam.ohw;
return static_cast<void*>(dst);
}
}
//! Run the full (packing-free) matmul kernel: raw filter slice as A, the
//! im2col buffer as B, destination chosen by get_matmul_dst_ptr.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode,PackMode::NO_PACK>::
exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
MEGDNN_MARK_USED_VAR(bundle);
MEGDNN_MARK_USED_VAR(ncb_index);
// scratch space for matmul backends that need compute workspace
matmul_param.workspace_ptr = bundle_thread.get(THREAD_BUNDLE_MATCOMP_INDEX);
void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);
src_ctype* im2col_dst = static_cast<src_ctype*>(
bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX));
// A starts at this block's first output channel: each OC row of the
// filter holds icpg * fh * fw weights
const void* filter = param.filter<src_ctype>(sparam.group_id) +
sparam.oc_cur_index * param.filter_meta.icpg *
param.filter_meta.spatial[0] *
param.filter_meta.spatial[1];
matmul_param.M = sparam.output_block_oc_size;
matmul_param.N = sparam.output_block_size;
matmul_param.LDB = sparam.output_block_size;
matmul_param.LDC = sparam.output_block_size;
matmul_param.A_ptr = filter;
matmul_param.B_ptr = im2col_dst;
matmul_param.C_ptr = matmul_dst;
auto matmul_kern = matmul_algo->get_kern(matmul_param);
matmul_kern(matmul_param);
}
//! Expand the current output block into im2col (matrix B) layout.  Unlike
//! the DEFAULT mode, NO_PACK leaves B unpacked (no pack_B call), so the
//! matmul parameters are untouched here.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode,PackMode::NO_PACK>::
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
fallback::MatrixMulImpl::AlgoBase* matmul_algo
) {
MEGDNN_MARK_USED_VAR(matmul_param);
MEGDNN_MARK_USED_VAR(matmul_algo);
size_t m_sh = param.filter_meta.stride[0];
size_t m_sw = param.filter_meta.stride[1];
size_t m_oc = param.filter_meta.ocpg;
size_t m_oh = param.osz[0];
size_t m_ow = param.osz[1];
size_t m_ic = param.filter_meta.icpg;
// padded input extents (copy_padding_kern already wrote this image)
size_t m_ih = param.isz[0] + param.filter_meta.padding[0] * 2;
size_t m_iw = param.isz[1] + param.filter_meta.padding[1] * 2;
size_t m_fh = param.filter_meta.spatial[0];
size_t m_fw = param.filter_meta.spatial[1];
size_t m_is_xcorr = !param.filter_meta.should_flip;
// byte offset of this (batch, group) image inside the padding workspace
size_t input_offset =
m_ih * m_iw * m_ic *
(sparam.group_id + param.filter_meta.group * sparam.batch_id) *
sizeof(src_ctype);
src_ctype* src2 = reinterpret_cast<src_ctype*>(
reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PADDING_INDEX)) +
input_offset);
bool is_phpwzero = param.filter_meta.padding[0] == 0 &&
param.filter_meta.padding[1] == 0;
if (is_phpwzero) {
// no padding needed: read straight from the original input tensor
src2 = const_cast<src_ctype*>(
param.src<src_ctype>(sparam.batch_id, sparam.group_id));
}
src_ctype* im2col_dst = static_cast<src_ctype*>(
bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX));
// stride-1 uses a specialised kernel; the template flag selects
// cross-correlation (no filter flip) vs true convolution
if (m_sh == 1 && m_sw == 1) {
if (m_is_xcorr) {
img2col<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
m_fh, m_fw, sparam.ohw_cur_index,
sparam.output_block_size);
} else {
img2col<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
m_fh, m_fw, sparam.ohw_cur_index,
sparam.output_block_size);
}
} else {
if (m_is_xcorr) {
img2col_stride<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih,
m_iw, m_fh, m_fw, m_sh, m_sw,
sparam.ohw_cur_index,
sparam.output_block_size);
} else {
img2col_stride<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic,
m_ih, m_iw, m_fh, m_fw, m_sh, m_sw,
sparam.ohw_cur_index,
sparam.output_block_size);
}
}
}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
//! Apply bias + nonlinearity to the matmul result of the current output
//! block, then copy it into the final dst layout when required.
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::NO_PACK>::
        exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
                         const StrategyParam& sparam,
                         WorkspaceBundle bundle_thread) {
    //! stage the per-element bias of this block into thread storage first
    //! (no-op unless bias_mode == BIAS)
    copy_bias(param, bundle_thread, sparam);
    void* dst_ptr = get_matmul_dst_ptr(param, bundle_thread, sparam);
    const bias_ctype* bias_base = static_cast<const bias_ctype*>(
            param.bias<bias_ctype>(sparam.batch_id, sparam.group_id));
    bias_ctype* bias_workspace =
            static_cast<bias_ctype*>(get_bias_temp_ptr(param, bundle_thread));
    //! BIAS mode reads the staged contiguous copy; other modes index the
    //! raw bias by the first output channel of this block
    void* bias_arg = nullptr;
    if (param.bias_mode == megdnn::BiasMode::BIAS) {
        bias_arg = static_cast<void*>(bias_workspace);
    } else {
        bias_arg = static_cast<void*>(
                const_cast<bias_ctype*>(bias_base + sparam.oc_cur_index));
    }
    PostProcess<op_ctype, op_dtype, postprocess_mode>::run(
            dst_ptr, bias_arg, dst_ptr, param.bias_mode, param.nonlineMode,
            param.bias_type, param.dst_type, 1_z, sparam.output_block_oc_size,
            1_z, sparam.output_block_size);
    copy_dst(param, dst_ptr, sparam);
}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
//! Copy the post-processed block from the thread-local matmul buffer into
//! its final position in the dst tensor.
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::NO_PACK>::
        copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param,
                 const void* matmul_dst, const StrategyParam& sparam) {
    //! nothing to do when matmul wrote straight into the output tensor
    if (sparam.skip_copy_dst)
        return;
    const dst_ctype* src_row = static_cast<const dst_ctype*>(matmul_dst);
    dst_ctype* dst_row =
            param.dst<dst_ctype>(sparam.batch_id, sparam.group_id) +
            sparam.oc_cur_index * sparam.ohw + sparam.ohw_cur_index;
    //! source rows are densely packed at output_block_size elements, while
    //! dst rows are strided by the full spatial size ohw
    for (size_t row = 0; row < sparam.output_block_oc_size; ++row) {
        std::memcpy(dst_row, src_row,
                    sizeof(dst_ctype) * sparam.output_block_size);
        src_row += sparam.output_block_size;
        dst_row += sparam.ohw;
    }
}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
//! Gather this block's slice of a full per-element bias tensor into a
//! contiguous thread-local buffer; no-op for every other bias mode.
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::NO_PACK>::
        copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param,
                  WorkspaceBundle bundle_thread, const StrategyParam& sparam) {
    const bias_ctype* bias_base = static_cast<const bias_ctype*>(
            param.bias<bias_ctype>(sparam.batch_id, sparam.group_id));
    bias_ctype* staged =
            static_cast<bias_ctype*>(get_bias_temp_ptr(param, bundle_thread));
    //! staging is only needed when the bias has one value per output element
    if (param.bias_mode != megdnn::BiasMode::BIAS)
        return;
    const bias_ctype* read_ptr = bias_base +
                                 sparam.oc_cur_index * sparam.ohw +
                                 sparam.ohw_cur_index;
    //! one strided row per output channel of the current block
    for (size_t channel = sparam.oc_cur_index; channel < sparam.oc_end_index;
         ++channel) {
        std::memcpy(staged, read_ptr,
                    sizeof(bias_ctype) * sparam.output_block_size);
        staged += sparam.output_block_size;
        read_ptr += sparam.ohw;
    }
}
//! Explicit instantiations of the NO_PACK strategy for every supported
//! (src, bias, dst, op) type combination.
#define INSTANTIAL_CLASS(_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \
                         _op_dtype, _postprocess_mode)                   \
    template class Strategy<_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \
                            _op_dtype, _postprocess_mode, PackMode::NO_PACK>;
INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32,
                 megdnn::PostprocessMode::FLOAT)
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
//! with native half arithmetic, fp16 uses the float post-process path
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16,
                 megdnn::PostprocessMode::FLOAT)
#else
#if !MEGDNN_DISABLE_FLOAT16
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
                 megdnn::PostprocessMode::NO_PROCESS)
#endif
#endif
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
//! x86 does not have a uint8 matmul, so only armv7/armv8 support uint8
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8,
                 megdnn::PostprocessMode::QUANTIZED)
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_int32, dt_qint32, dt_qint32,
                 megdnn::PostprocessMode::NO_PROCESS)
#endif
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int8, dt_qint32, dt_qint8,
                 megdnn::PostprocessMode::QUANTIZED)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32,
                 megdnn::PostprocessMode::NO_PROCESS)
INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16,
                 megdnn::PostprocessMode::NO_PROCESS)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_qint32, dt_qint32,
                 megdnn::PostprocessMode::NO_PROCESS)
//! keep the helper macro local to this translation unit (the ONLY_PACKA
//! instantiation file also #undefs its copy)
#undef INSTANTIAL_CLASS
} // namespace megdnn
/**
* \file dnn/src/fallback/conv_bias/im2col/algos.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "megdnn/opr_param_defs.h"
#include "src/fallback/conv_bias/im2col/strategy_base.h"
#include "src/fallback/convolution/img2col_helper.h"
#if MEGDNN_X86
#include "src/x86/conv_bias/postprocess_helper.h"
#endif
using namespace megdnn;
#if MEGDNN_X86
using namespace x86;
#endif
namespace megdnn {
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
//! Trivial constructor: the ONLY_PACKA strategy keeps no state of its own,
//! so it simply forwards to the (stateless) StrategyBase.
Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
         postprocess_mode,PackMode::ONLY_PACKA>::Strategy()
        : StrategyBase() {}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
//! Copy one input channel of one (batch, group) into the padded workspace
//! image: the IH x IW source plane is surrounded by PH rows / PW columns of
//! the source zero point, so the im2col pass can later run without bounds
//! checks.
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode,PackMode::ONLY_PACKA>::
        copy_padding_kern(
                WorkspaceBundle bundle,
                const fallback::ConvBiasImpl::NCBKernParam& param,
                const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    //! only the input geometry and padding are consumed below
    MEGDNN_MARK_USED_VAR(N);
    MEGDNN_MARK_USED_VAR(OC);
    MEGDNN_MARK_USED_VAR(OH);
    MEGDNN_MARK_USED_VAR(OW);
    MEGDNN_MARK_USED_VAR(FH);
    MEGDNN_MARK_USED_VAR(FW);
    MEGDNN_MARK_USED_VAR(SH);
    MEGDNN_MARK_USED_VAR(SW);
    //! dimensions of the padded plane
    size_t IW2 = IW + 2 * PW;
    size_t IH2 = IH + 2 * PH;
    //! this kernel instance handles exactly one (batch, group, channel)
    size_t batch_id = ncb_index.ndrange_id[0];
    size_t group_id = ncb_index.ndrange_id[1];
    size_t channel_id = ncb_index.ndrange_id[2];
    size_t padding_group_size = IH2 * IW2 * IC;
    size_t workspace_channel_offset = IH2 * IW2 * channel_id;
    size_t workspace_group_offset = group_id * padding_group_size;
    size_t workspace_batch_offset =
            param.filter_meta.group * batch_id * padding_group_size;
    bundle.set(param.workspace_ptr);
    //! padding value: the zero point for asymmetric quantized input,
    //! plain zero for every other dtype
    src_ctype src_zp = static_cast<src_ctype>(0);
    if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) {
        src_zp = param.src_type.param<dtype::Quantized8Asymm>().zero_point;
    }
    src_ctype* src = const_cast<src_ctype*>(
            param.src<src_ctype>(batch_id, group_id, channel_id));
    src_ctype* src2;
    src2 = static_cast<src_ctype*>(bundle.get(BUNDLE_PADDING_INDEX)) +
           workspace_group_offset + workspace_batch_offset +
           workspace_channel_offset;
    src_ctype* src2_ptr = src2;
    const src_ctype* src_ptr = src;
    //! top padding rows. NOTE(review): memset fills single bytes; this is
    //! only correct because src_zp is non-zero solely in the
    //! Quantized8Asymm branch above — assumes that case maps to a 1-byte
    //! src_ctype, and every multi-byte type pads with 0. TODO confirm.
    if (PH != 0) {
        std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
        src2_ptr += PH * IW2;
    }
    //! each source row: left padding, payload copy, right padding
    rep(ih, IH) {
        if (PW != 0)
            rep(pw, PW) * (src2_ptr++) = src_zp;
        std::memcpy(src2_ptr, src_ptr, sizeof(src_ctype) * IW);
        src2_ptr += IW;
        src_ptr += IW;
        if (PW != 0)
            rep(pw, PW) * (src2_ptr++) = src_zp;
    }
    //! bottom padding rows (same byte-fill caveat as above)
    if (PH != 0) {
        std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
        src2_ptr += PH * IW2;
    }
}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
//! Pre-pack one OC tile of the filter (the matmul A operand) into its slot
//! of the shared packA workspace so exec_matmul can reuse the panel.
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::ONLY_PACKA>::
        packA_kern(WorkspaceBundle bundle,
                   const fallback::ConvBiasImpl::NCBKernParam& param,
                   fallback::MatrixMulImpl::KernSizeParam matmulparam,
                   fallback::MatrixMulImpl::AlgoBase* matmul_algo,
                   const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    bundle.set(param.workspace_ptr);
    //! build a KernParam sharing the size description of matmulparam
    fallback::MatrixMulImpl::KernParam matmul_param;
    static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
            matmulparam;
    const size_t total_oc = param.filter_meta.ocpg;
    const size_t tile_oc = matmul_param.M;
    const size_t gid = ncb_index.ndrange_id[0];
    const size_t tile_id = ncb_index.ndrange_id[1];
    const size_t oc_start = tile_id * tile_oc;
    //! the last tile may hold fewer than tile_oc channels
    const size_t cur_oc = std::min(tile_oc, total_oc - oc_start);
    const size_t packa_bytes_per_group =
            bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group;
    //! query the per-tile panel size while matmul_param.M still holds the
    //! full tile size (it is overwritten just below)
    const size_t panel_off =
            tile_id * matmul_algo->get_bundle(matmul_param).get_size(0);
    int8_t* packed_dst = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) +
                         gid * packa_bytes_per_group + panel_off;
    matmul_param.A_ptr =
            const_cast<src_ctype*>(param.filter<src_ctype>(gid)) +
            oc_start * matmul_param.K;
    matmul_param.M = cur_oc;
    matmul_algo->pack_A(matmul_param, packed_dst, 0_z, 0_z);
}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
//! Decide where the matmul writes its result: directly into the output
//! tensor when no later conversion/copy is needed, otherwise into the
//! thread-local scratch buffer.
void* Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
               postprocess_mode, PackMode::ONLY_PACKA>::
        get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param,
                           const WorkspaceBundle& bundle_thread,
                           const StrategyParam& sparam) {
    //! 8-bit dst needs a widened intermediate, and small ohw blocks are
    //! assembled in scratch first — both cases go through the buffer
    const bool write_direct = !sparam.is_dst_8bit && sparam.is_ohw_size_bigger;
    if (!write_direct) {
        return static_cast<void*>(
                bundle_thread.get(THREAD_BUNDLE_MATMULDST_INDEX));
    }
    bias_ctype* dst_base =
            param.dst<bias_ctype>(sparam.batch_id, sparam.group_id) +
            sparam.oc_cur_index * sparam.ohw;
    return static_cast<void*>(dst_base);
}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
//! Run the naked matmul kernel: A is the pre-packed filter panel of this
//! OC tile, B is the im2col matrix produced in thread storage.
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::ONLY_PACKA>::
        exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
                    const StrategyParam& sparam, WorkspaceBundle bundle,
                    WorkspaceBundle bundle_thread,
                    fallback::MatrixMulImpl::KernParam matmul_param,
                    fallback::MatrixMulImpl::AlgoBase* matmul_algo,
                    const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    const size_t packa_bytes_per_group =
            bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group;
    //! offset of this OC tile's packed panel inside its group slot
    //! (panel size queried before matmul_param.M is shrunk below)
    const size_t panel_in_group =
            ncb_index.ndrange_id[3] *
            matmul_algo->get_bundle(matmul_param).get_size(0);
    const size_t panel_off =
            sparam.group_id * packa_bytes_per_group + panel_in_group;
    void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);
    src_ctype* a_panel = reinterpret_cast<src_ctype*>(
            reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) +
            panel_off);
    src_ctype* b_panel = nullptr;  //! B is consumed unpacked
    src_ctype* im2col_dst = static_cast<src_ctype*>(
            bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX));
    //! shrink the matmul to the current output block
    matmul_param.M = sparam.output_block_oc_size;
    matmul_param.N = sparam.output_block_size;
    matmul_param.LDB = sparam.output_block_size;
    matmul_param.LDC = sparam.output_block_size;
    matmul_param.B_ptr = im2col_dst;
    matmul_param.C_ptr = matmul_dst;
    auto kern = matmul_algo->get_kern_naked(matmul_param);
    kern(matmul_param, a_panel, b_panel);
}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
//! Expand the (padded) input of the current (batch, group) into the im2col
//! matrix for this output block, writing into thread-local storage so the
//! following matmul can treat it as a plain B operand.
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode,PackMode::ONLY_PACKA>::
        exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
                    const StrategyParam& sparam,
                    const fallback::ConvBiasImpl::NCBKernParam& param,
                    fallback::MatrixMulImpl::KernParam matmul_param,
                    fallback::MatrixMulImpl::AlgoBase* matmul_algo
                    ) {
    //! the matmul arguments belong to the common strategy interface but
    //! are not needed by the im2col step itself
    MEGDNN_MARK_USED_VAR(matmul_param);
    MEGDNN_MARK_USED_VAR(matmul_algo);
    size_t m_sh = param.filter_meta.stride[0];
    size_t m_sw = param.filter_meta.stride[1];
    size_t m_oc = param.filter_meta.ocpg;
    size_t m_oh = param.osz[0];
    size_t m_ow = param.osz[1];
    size_t m_ic = param.filter_meta.icpg;
    //! input sizes after padding (the padded copy is produced by
    //! copy_padding_kern)
    size_t m_ih = param.isz[0] + param.filter_meta.padding[0] * 2;
    size_t m_iw = param.isz[1] + param.filter_meta.padding[1] * 2;
    size_t m_fh = param.filter_meta.spatial[0];
    size_t m_fw = param.filter_meta.spatial[1];
    //! no filter flip means cross-correlation rather than true convolution
    size_t m_is_xcorr = !param.filter_meta.should_flip;
    //! byte offset of this (batch, group) inside the padded workspace
    size_t input_offset =
            m_ih * m_iw * m_ic *
            (sparam.group_id + param.filter_meta.group * sparam.batch_id) *
            sizeof(src_ctype);
    src_ctype* src2 = reinterpret_cast<src_ctype*>(
            reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PADDING_INDEX)) +
            input_offset);
    bool is_phpwzero = param.filter_meta.padding[0] == 0 &&
                       param.filter_meta.padding[1] == 0;
    //! with zero padding the original input tensor is read directly,
    //! skipping the padded workspace copy
    if (is_phpwzero) {
        src2 = const_cast<src_ctype*>(
                param.src<src_ctype>(sparam.batch_id, sparam.group_id));
    }
    src_ctype* im2col_dst = static_cast<src_ctype*>(
            bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX));
    //! dispatch: unit stride takes the faster img2col path; the template
    //! flag selects cross-correlation vs. flipped-filter traversal
    if (m_sh == 1 && m_sw == 1) {
        if (m_is_xcorr) {
            img2col<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
                          m_fh, m_fw, sparam.ohw_cur_index,
                          sparam.output_block_size);
        } else {
            img2col<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
                           m_fh, m_fw, sparam.ohw_cur_index,
                           sparam.output_block_size);
        }
    } else {
        if (m_is_xcorr) {
            img2col_stride<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih,
                                 m_iw, m_fh, m_fw, m_sh, m_sw,
                                 sparam.ohw_cur_index,
                                 sparam.output_block_size);
        } else {
            img2col_stride<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic,
                                  m_ih, m_iw, m_fh, m_fw, m_sh, m_sw,
                                  sparam.ohw_cur_index,
                                  sparam.output_block_size);
        }
    }
}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
//! Apply bias + nonlinearity to the matmul result of the current output
//! block, then copy it into the final dst layout when required.
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::ONLY_PACKA>::
        exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
                         const StrategyParam& sparam,
                         WorkspaceBundle bundle_thread) {
    void* dst_ptr = get_matmul_dst_ptr(param, bundle_thread, sparam);
    const bias_ctype* bias_base = static_cast<const bias_ctype*>(
            param.bias<bias_ctype>(sparam.batch_id, sparam.group_id));
    bias_ctype* bias_workspace =
            static_cast<bias_ctype*>(get_bias_temp_ptr(param, bundle_thread));
    if (param.bias_mode == megdnn::BiasMode::BIAS) {
        //! gather this block's slice of the full bias tensor (one strided
        //! row per output channel) into a contiguous scratch buffer
        bias_ctype* write_ptr = bias_workspace;
        const bias_ctype* read_ptr = bias_base +
                                     sparam.oc_cur_index * sparam.ohw +
                                     sparam.ohw_cur_index;
        for (size_t channel = sparam.oc_cur_index;
             channel < sparam.oc_end_index; ++channel) {
            std::memcpy(write_ptr, read_ptr,
                        sizeof(bias_ctype) * sparam.output_block_size);
            write_ptr += sparam.output_block_size;
            read_ptr += sparam.ohw;
        }
    }
    //! BIAS mode reads the gathered copy; other modes index the raw bias
    //! by the first output channel of this block
    void* bias_arg = nullptr;
    if (param.bias_mode == megdnn::BiasMode::BIAS) {
        bias_arg = static_cast<void*>(bias_workspace);
    } else {
        bias_arg = static_cast<void*>(
                const_cast<bias_ctype*>(bias_base + sparam.oc_cur_index));
    }
    PostProcess<op_ctype, op_dtype, postprocess_mode>::run(
            dst_ptr, bias_arg, dst_ptr, param.bias_mode, param.nonlineMode,
            param.bias_type, param.dst_type, 1_z, sparam.output_block_oc_size,
            1_z, sparam.output_block_size);
    copy_dst(param, dst_ptr, sparam);
}
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
//! Copy the post-processed block from the thread-local matmul buffer into
//! its final position in the dst tensor.
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::ONLY_PACKA>::
        copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param,
                 const void* matmul_dst, const StrategyParam& sparam) {
    //! nothing to do when matmul wrote straight into the output tensor
    if (sparam.skip_copy_dst)
        return;
    const dst_ctype* src_row = static_cast<const dst_ctype*>(matmul_dst);
    dst_ctype* dst_row =
            param.dst<dst_ctype>(sparam.batch_id, sparam.group_id) +
            sparam.oc_cur_index * sparam.ohw + sparam.ohw_cur_index;
    //! source rows are densely packed at output_block_size elements, while
    //! dst rows are strided by the full spatial size ohw
    for (size_t row = 0; row < sparam.output_block_oc_size; ++row) {
        std::memcpy(dst_row, src_row,
                    sizeof(dst_ctype) * sparam.output_block_size);
        src_row += sparam.output_block_size;
        dst_row += sparam.ohw;
    }
}
//! Explicit instantiations of the ONLY_PACKA strategy for every supported
//! (src, bias, dst, op) type combination.
#define INSTANTIAL_CLASS(_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \
                         _op_dtype, _postprocess_mode)                   \
    template class Strategy<_src_ctype, _bias_ctype, _dst_ctype, \
            _op_ctype, _op_dtype, _postprocess_mode,PackMode::ONLY_PACKA>;
INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32,
                 megdnn::PostprocessMode::FLOAT)
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
//! with native half arithmetic, fp16 uses the float post-process path
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16,
                 megdnn::PostprocessMode::FLOAT)
#else
#if !MEGDNN_DISABLE_FLOAT16
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
                 megdnn::PostprocessMode::NO_PROCESS)
#endif
#endif
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
//! x86 does not have a uint8 matmul, so only armv7/armv8 support uint8
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8,
                 megdnn::PostprocessMode::QUANTIZED)
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_int32, dt_qint32, dt_qint32,
                 megdnn::PostprocessMode::NO_PROCESS)
#endif
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int8, dt_qint32, dt_qint8,
                 megdnn::PostprocessMode::QUANTIZED)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32,
                 megdnn::PostprocessMode::NO_PROCESS)
INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16,
                 megdnn::PostprocessMode::NO_PROCESS)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_qint32, dt_qint32,
                 megdnn::PostprocessMode::NO_PROCESS)
//! keep the helper macro local to this translation unit
#undef INSTANTIAL_CLASS
} // namespace megdnn
......@@ -8,7 +8,6 @@
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include <cstddef>
#include "src/common/utils.h"
namespace {
......@@ -42,7 +41,8 @@ void img2col_stride(const dtype* __restrict src, dtype* __restrict dst,
}
}
//! add for im2col matmul multithread
//!add for im2col matmul multithread
template <bool is_xcorr, typename dtype>
void img2col_stride(const dtype* __restrict src, dtype* __restrict dst,
const int OC, const int OH, const int OW, const int IC,
......
......@@ -323,6 +323,7 @@ struct UnaryOpBase<SIMDType::NONE, dt_qint32, dt_qint8>
init(src_scale, dst_scale);
}
};
template <>
struct UnaryOpBase<SIMDType::NONE, dt_qint32, dt_quint8>
: OpBase<dt_qint32, dt_quint8> {
......@@ -330,20 +331,24 @@ struct UnaryOpBase<SIMDType::NONE, dt_qint32, dt_quint8>
using src_ctype = dt_qint32;
using dst_ctype = dt_quint8;
float scale, scale_src, scale_dst;
void init(float src_scale, float dst_scale) {
uint8_t dzp;
void init(float src_scale, float dst_scale, uint8_t dst_zp) {
scale_src = src_scale;
scale_dst = 1.f / dst_scale;
scale_dst = 1.0f / dst_scale;
dzp = dst_zp;
scale = src_scale / dst_scale;
}
UnaryOpBase(DType src_dtype, DType dst_dtype) {
float src_scale = src_dtype.param<dtype::QuantizedS32>().scale;
float dst_scale = dst_dtype.param<dtype::QuantizedS8>().scale;
init(src_scale, dst_scale);
float dst_scale = dst_dtype.param<dtype::Quantized8Asymm>().scale;
uint8_t dst_zp = dst_dtype.param<dtype::Quantized8Asymm>().zero_point;
init(src_scale, dst_scale, dst_zp);
}
UnaryOpBase(float src_scale, float dst_scale) {
init(src_scale, dst_scale);
UnaryOpBase(float src_scale, float dst_scale, uint8_t dst_zp) {
init(src_scale, dst_scale, dst_zp);
}
};
#define OP_BASE(_simd_type, _simd_target, _simd_data_type, _func_prefix) \
template <> \
struct UnaryOpBase<_simd_type, dt_float32, dt_qint8> \
......@@ -828,7 +833,6 @@ template <typename Op>
struct UnaryQuantizationOp<SIMDType::NONE, dt_qint32, dt_quint8, Op>
: UnaryOpBase<SIMDType::NONE, dt_qint32, dt_quint8> {
using UnaryOpBase<SIMDType::NONE, dt_qint32, dt_quint8>::UnaryOpBase;
constexpr static size_t SIMD_WIDTH = 8;
Op op;
void operator()(const dt_qint32& src, dt_quint8* dst) const {
......
......@@ -195,10 +195,10 @@ MatrixMulImpl::kern_t MatrixMulImpl::AlgoInt8x8x32Vnni::get_kern(
return int8x8x32_kern_vnni;
}
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA(AlgoInt8x8x32Vnni,
megdnn_x86_matmul_kern, 5,
x86::matmul::gemm_int8_vnni_12x32x4,
dt_int8, dt_int32, dt_uint8);
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(AlgoInt8x8x32Vnni,
megdnn_x86_matmul_kern, 5,
x86::matmul::gemm_int8_vnni_12x32x4,
dt_int8, dt_int32, dt_uint8);
#endif
/* ===================== Int8 mkldnn algo ===================== */
......@@ -364,7 +364,9 @@ size_t MatrixMulImpl::AlgoInt8x8x32AVX2M4N16K2::get_workspace(
m, n, k, trans_a, trans_b, strategy, cacheline)
.get_workspace_size();
}
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(
AlgoInt8x8x32AVX2M4N16K2, megdnn_x86_matmul_kern, 8,
x86::matmul::gemm_avx2_s8s8s32_4x16x2, dt_int8, dt_int32, dt_int16);
MatrixMulImpl::kern_t MatrixMulImpl::AlgoInt8x8x32AVX2M2N4K16::get_kern(
const KernSizeParam&) const {
......@@ -437,6 +439,10 @@ size_t MatrixMulImpl::AlgoInt8x8x32SSEM4N8K2::get_workspace(
.get_workspace_size();
}
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(
AlgoInt8x8x32SSEM4N8K2, megdnn_x86_matmul_kern, 9,
x86::matmul::gemm_sse_s8s8s32_4x8x2, dt_int8, dt_int32, dt_int16);
/*************************AlgoF32MK8_8x8********************/
MatrixMulImpl::kern_t MatrixMulImpl::AlgoF32MK8_8x8::get_kern(
const KernSizeParam&) const {
......
......@@ -68,7 +68,7 @@ public:
size_t get_workspace(const KernSizeParam&) const override;
kern_t get_kern(const KernSizeParam&) const override;
void* type() const override { return sm_x86_algo_type; }
PackMode packmode() const override { return PackMode::NO_PACK; }
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL();
};
class MatrixMulImpl::AlgoInt8x8x32SSEM4N8K2 : public AlgoBase {
......@@ -79,7 +79,7 @@ public:
size_t get_workspace(const KernSizeParam&) const override;
kern_t get_kern(const KernSizeParam&) const override;
void* type() const override { return sm_x86_algo_type; }
PackMode packmode() const override { return PackMode::NO_PACK; }
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL();
};
class MatrixMulImpl::AlgoF32MK8_8x8 : public AlgoBase {
......
......@@ -741,7 +741,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) {
TensorShape{oc, ic, kernel, kernel}, TensorShape{});
};
for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
for (size_t kernel : {2, 3, 4, 5, 6, 7})
for (size_t ic : {1, 4, 8, 16})
for (size_t oc : {1, 4, 8})
for (size_t p : {0, 2})
......@@ -751,7 +751,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) {
run(oc, ic, size, size, kernel, p, nonline_mode);
}
//! test OC block
run(2046, 1, 8, 8, 1, 0, NonlineMode::IDENTITY);
run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY);
Checker<ConvBias> checker(handle());
UniformIntRNG rng{-50, 50};
......@@ -826,7 +826,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) {
(w + 2 * p - kernel) / param.stride_w + 1});
};
for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
for (size_t kernel : {2, 3, 4, 5, 6, 7})
for (size_t ic : {1, 4, 8, 16})
for (size_t oc : {1, 4, 8, 16, 300})
for (size_t p : {0, 2})
......@@ -895,7 +895,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
(w + 2 * param.pad_w - kernel) / 1 + 1});
};
for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
for (size_t kernel : {2, 3, 4, 5, 6, 7})
for (size_t ic : {1, 4, 8, 16})
for (size_t oc : {1, 4, 8, 16})
for (size_t p : {0, 1})
......@@ -945,7 +945,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
TensorShape{1, oc, 1, 1});
};
for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
for (size_t kernel : {2, 3, 4, 5, 6, 7})
for (size_t ic : {1, 4, 8, 16})
for (size_t oc : {1, 4, 8})
for (size_t p : {0, 2})
......@@ -2183,7 +2183,7 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) {
std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
dtype::Int32(), dtype::Int32()};
std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2";
std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2:192";
// std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16";
// printf("Benchmark IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2 algo\n");
benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册