提交 217999b1 编写于 作者: M Megvii Engine Team

feat(arm): add winograd F43 NCHW44 algo and winograd F43 44 algo

GitOrigin-RevId: a981b2f61b123d6d5b476e8316c8bb83f0367647
上级 f0f6f5fe
...@@ -177,6 +177,15 @@ ...@@ -177,6 +177,15 @@
UNROLL_RAW_5x2(cb, v0, ##a) \ UNROLL_RAW_5x2(cb, v0, ##a) \
cb(5, 0, ##a) cb(5, 1, ##a) cb(5, 0, ##a) cb(5, 1, ##a)
//! Expands cb(i, j, a...) once for every pair (i, j) with i in [0, 4) and
//! j in [0, 6), in row-major order. The v0 parameter is accepted but is not
//! used by the expansion.
#define UNROLL_RAW_4x6(cb, v0, a...) \
cb(0, 0, ##a) cb(0, 1, ##a) cb(0, 2, ##a) cb(0, 3, ##a) cb(0, 4, ##a) cb(0, 5, ##a) \
cb(1, 0, ##a) cb(1, 1, ##a) cb(1, 2, ##a) cb(1, 3, ##a) cb(1, 4, ##a) cb(1, 5, ##a) \
cb(2, 0, ##a) cb(2, 1, ##a) cb(2, 2, ##a) cb(2, 3, ##a) cb(2, 4, ##a) cb(2, 5, ##a) \
cb(3, 0, ##a) cb(3, 1, ##a) cb(3, 2, ##a) cb(3, 3, ##a) cb(3, 4, ##a) cb(3, 5, ##a)
//! Expands cb(i, j, a...) for i in [0, 5) and j in [0, 6): rows 0..3 come
//! from UNROLL_RAW_4x6 and row 4 is appended here.
#define UNROLL_RAW_5x6(cb, v0, a...) \
UNROLL_RAW_4x6(cb, v0, ##a) \
cb(4, 0, ##a) cb(4, 1, ##a) cb(4, 2, ##a) cb(4, 3, ##a) cb(4, 4, ##a) cb(4, 5, ##a)
#define UNROLL_CALL0_D2(step, step2, cb, v...) \ #define UNROLL_CALL0_D2(step, step2, cb, v...) \
UNROLL_RAW_##step##x##step2(cb, 0, ##v) UNROLL_RAW_##step##x##step2(cb, 0, ##v)
#define UNROLL_CALL1_D2(step, step2, cb, v...) \ #define UNROLL_CALL1_D2(step, step2, cb, v...) \
......
...@@ -218,6 +218,44 @@ MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL( ...@@ -218,6 +218,44 @@ MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF63_4x4, winograd::winograd_6x3_4x4_f, AlgoFP32WinogradF63_4x4, winograd::winograd_6x3_4x4_f,
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4); megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
/* ======================= AlgoFP32WinogradF43_4x4 ======================== */
/**
 * @brief Decide whether the F(4x4, 3x3) MK4 winograd algorithm can run @p param.
 *
 * Accepts only NCHW, non-flipped, 3x3, stride-1, dilation-1, Float32,
 * DEFAULT-compute convolutions whose per-group input/output channel counts
 * are multiples of 4, and only when the bound matmul algorithm is usable for
 * the derived matmul parameters and reports PackMode::NO_PACK.
 *
 * @param param Kernel size parameters of the convolution to check.
 * @return true when every condition above holds, false otherwise.
 */
bool ConvBiasImpl::AlgoFP32WinogradF43_4x4::usable(
        const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_fallback_winograd_fp32, 6, 0) {
        const auto& fm = param.filter_meta;
        //! reject early: both channel counts per group must be multiples of 4
        const bool channels_packable = fm.icpg % 4 == 0 && fm.ocpg % 4 == 0;
        if (!channels_packable)
            return false;

        using Strategy = winograd::winograd_4x3_4x4_f;
        using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode;
        using WinogradConv =
                megdnn::winograd::ConvBias<Strategy, param::MatrixMul::Format::MK4>;

        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                WinogradConv(strategy, m_tile_size, param).get_matmul_kern_param(param);

        //! the matmul backend is queried first, as in the sibling algorithms
        if (!m_matmul_algo->usable(matmul_param))
            return false;
        if (m_matmul_algo->packmode() != PackMode::NO_PACK)
            return false;
        if (fm.format != param::ConvBias::Format::NCHW || fm.should_flip)
            return false;

        const bool kernel_3x3 = fm.spatial[0] == fm.spatial[1] && fm.spatial[0] == 3;
        const bool unit_stride = fm.stride[0] == fm.stride[1] && fm.stride[0] == 1;
        const bool no_dilation =
                fm.dilation[0] == fm.dilation[1] && fm.dilation[0] == 1;
        if (!kernel_3x3 || !unit_stride || !no_dilation)
            return false;

        return param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float32 && channels_packable;
    }
    MIDOUT_END();
    return false;
}
//! Binds AlgoFP32WinogradF43_4x4 to the winograd_4x3_4x4_f strategy, the
//! megdnn_fallback_winograd_fp32 midout tag and the MK4 matmul format.
//! NOTE(review): the exact member functions generated here depend on the
//! definition of MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL -- confirm there.
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF43_4x4, winograd::winograd_4x3_4x4_f,
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
/* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */ /* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */
bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable( bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable(
...@@ -297,6 +335,46 @@ MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL( ...@@ -297,6 +335,46 @@ MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF63_4x4_NCHW44, winograd::winograd_F63_mk4_f_nchw44, AlgoFP32WinogradF63_4x4_NCHW44, winograd::winograd_F63_mk4_f_nchw44,
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4); megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
/* =================== AlgoFP32WinogradF43_4x4_NCHW44 ===================== */
/**
 * @brief Decide whether the NCHW44 F(4x4, 3x3) winograd algorithm can run
 *        @p param.
 *
 * Accepts only NCHW44, non-flipped, 3x3, stride-1, dilation-1, Float32,
 * DEFAULT-compute convolutions whose per-group input/output channel counts
 * are multiples of 4, and only when the bound matmul algorithm is usable for
 * the derived matmul parameters and reports PackMode::NO_PACK.
 *
 * @param param Kernel size parameters of the convolution to check.
 * @return true when every condition above holds, false otherwise.
 */
bool ConvBiasImpl::AlgoFP32WinogradF43_4x4_NCHW44::usable(
        const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(
            megdnn_fallback_winograd_fp32,
            midout_iv("AlgoFP32WinogradF43_4x4_NCHW44"_hash)) {
        const auto& fm = param.filter_meta;
        //! reject early: both channel counts per group must be multiples of 4
        const bool channels_packable = fm.icpg % 4 == 0 && fm.ocpg % 4 == 0;
        if (!channels_packable)
            return false;

        using Strategy = winograd::winograd_F43_mk4_f_nchw44;
        using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode;
        using WinogradConv =
                megdnn::winograd::ConvBias<Strategy, param::MatrixMul::Format::MK4>;

        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                WinogradConv(strategy, m_tile_size, param).get_matmul_kern_param(param);

        //! the matmul backend is queried first, as in the sibling algorithms
        if (!m_matmul_algo->usable(matmul_param))
            return false;
        if (m_matmul_algo->packmode() != PackMode::NO_PACK)
            return false;
        if (fm.format != param::ConvBias::Format::NCHW44 || fm.should_flip)
            return false;

        const bool kernel_3x3 = fm.spatial[0] == fm.spatial[1] && fm.spatial[0] == 3;
        const bool unit_stride = fm.stride[0] == fm.stride[1] && fm.stride[0] == 1;
        const bool no_dilation =
                fm.dilation[0] == fm.dilation[1] && fm.dilation[0] == 1;
        if (!kernel_3x3 || !unit_stride || !no_dilation)
            return false;

        return param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float32 && channels_packable;
    }
    MIDOUT_END();
    return false;
}
//! Binds AlgoFP32WinogradF43_4x4_NCHW44 to the winograd_F43_mk4_f_nchw44
//! strategy, the megdnn_fallback_winograd_fp32 midout tag and the MK4 matmul
//! format.
//! NOTE(review): the exact member functions generated here depend on the
//! definition of MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL -- confirm there.
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF43_4x4_NCHW44, winograd::winograd_F43_mk4_f_nchw44,
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
/* =================== AlgoFP32WinogradF73_4x4_NCHW44 ===================== */ /* =================== AlgoFP32WinogradF73_4x4_NCHW44 ===================== */
bool ConvBiasImpl::AlgoFP32WinogradF73_4x4_NCHW44::usable( bool ConvBiasImpl::AlgoFP32WinogradF73_4x4_NCHW44::usable(
......
...@@ -81,6 +81,23 @@ public: ...@@ -81,6 +81,23 @@ public:
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F63_4X4_FP32) MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F63_4X4_FP32)
}; };
/**
 * @brief Float32 winograd F(4x4, 3x3) conv-bias algorithm running on top of a
 *        caller-supplied matmul algorithm.
 *
 * NOTE(review): m_name, m_matmul_algo and m_tile_size are not declared in
 * this class body; presumably MEGDNN_WINOGRAD_ALGO_FUN_DECLARE provides them
 * -- confirm against the macro definition.
 */
class ConvBiasImpl::AlgoFP32WinogradF43_4x4 final : public AlgoBase {
public:
    /**
     * @param matmul_algo Matmul algorithm used by the winograd computation;
     *                    stored as a raw pointer, not owned here.
     * @param tile_size   Tile size forwarded to the winograd helper and
     *                    embedded in the algorithm name.
     */
    AlgoFP32WinogradF43_4x4(
            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}

    //! Algorithm name, built on first use from the matmul algorithm name and
    //! the winograd parameters {4, 4, m_tile_size, 3}, then cached in m_name.
    const char* name() const override {
        if (!m_name.empty())
            return m_name.c_str();
        m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
                m_matmul_algo->name(), {4, 4, m_tile_size, 3});
        return m_name.c_str();
    }

    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }

    MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
    MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F43_4X4_FP32)
};
class ConvBiasImpl::AlgoFP32WinogradF54 final : public AlgoBase { class ConvBiasImpl::AlgoFP32WinogradF54 final : public AlgoBase {
public: public:
AlgoFP32WinogradF54( AlgoFP32WinogradF54(
...@@ -156,6 +173,24 @@ public: ...@@ -156,6 +173,24 @@ public:
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F63_4X4_NCHW44_F32) MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F63_4X4_NCHW44_F32)
}; };
/**
 * @brief Float32 winograd F(4x4, 3x3) conv-bias algorithm for the NCHW44
 *        tensor format, running on top of a caller-supplied matmul algorithm.
 *
 * NOTE(review): m_name, m_matmul_algo and m_tile_size are not declared in
 * this class body; presumably MEGDNN_WINOGRAD_ALGO_FUN_DECLARE provides them
 * -- confirm against the macro definition.
 */
class ConvBiasImpl::AlgoFP32WinogradF43_4x4_NCHW44 final : public AlgoBase {
public:
    /**
     * @param matmul_algo Matmul algorithm used by the winograd computation;
     *                    stored as a raw pointer, not owned here.
     * @param tile_size   Tile size forwarded to the winograd helper and
     *                    embedded in the algorithm name.
     */
    AlgoFP32WinogradF43_4x4_NCHW44(
            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}

    //! Algorithm name, built on first use from the matmul algorithm name, the
    //! winograd parameters {4, 4, m_tile_size, 3} and the NCHW44 format, then
    //! cached in m_name.
    const char* name() const override {
        if (!m_name.empty())
            return m_name.c_str();
        m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
                m_matmul_algo->name(), {4, 4, m_tile_size, 3},
                param::ConvBias::Format::NCHW44);
        return m_name.c_str();
    }

    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }

    MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
    MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F43_4X4_NCHW44_F32)
};
class ConvBiasImpl::AlgoFP32WinogradF73_4x4_NCHW44 final : public AlgoBase { class ConvBiasImpl::AlgoFP32WinogradF73_4x4_NCHW44 final : public AlgoBase {
public: public:
AlgoFP32WinogradF73_4x4_NCHW44( AlgoFP32WinogradF73_4x4_NCHW44(
......
...@@ -16,6 +16,8 @@ MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 3, 1, 1, winograd_4x ...@@ -16,6 +16,8 @@ MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 3, 1, 1, winograd_4x
MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 6, 3, 4, 4, winograd_6x3_4x4_f) MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 6, 3, 4, 4, winograd_6x3_4x4_f)
//! Declares the winograd_4x3_4x4_f strategy with float for all four type slots.
//! NOTE(review): the numeric arguments presumably mean output block 4,
//! kernel 3 and a 4x4 channel packing -- confirm against the definition of
//! MEGDNN_REG_WINOGRAD_STRATEGY.
MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 3, 4, 4, winograd_4x3_4x4_f)
MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 5, 4, 1, 1, winograd_5x4_1x1_f) MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 5, 4, 1, 1, winograd_5x4_1x1_f)
MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 5, 1, 1, winograd_4x5_1x1_f) MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 5, 1, 1, winograd_4x5_1x1_f)
...@@ -26,6 +28,9 @@ MEGDNN_REG_WINOGRAD_STRATEGY( ...@@ -26,6 +28,9 @@ MEGDNN_REG_WINOGRAD_STRATEGY(
MEGDNN_REG_WINOGRAD_STRATEGY( MEGDNN_REG_WINOGRAD_STRATEGY(
float, float, float, float, 6, 3, 4, 4, winograd_F63_mk4_f_nchw44) float, float, float, float, 6, 3, 4, 4, winograd_F63_mk4_f_nchw44)
//! Declares the winograd_F43_mk4_f_nchw44 strategy with float for all four
//! type slots.
//! NOTE(review): the numeric arguments presumably mean output block 4,
//! kernel 3 and a 4x4 channel packing -- confirm against the definition of
//! MEGDNN_REG_WINOGRAD_STRATEGY.
MEGDNN_REG_WINOGRAD_STRATEGY(
float, float, float, float, 4, 3, 4, 4, winograd_F43_mk4_f_nchw44)
MEGDNN_REG_WINOGRAD_STRATEGY( MEGDNN_REG_WINOGRAD_STRATEGY(
float, float, float, float, 7, 3, 4, 4, winograd_F73_mk4_f_nchw44) float, float, float, float, 7, 3, 4, 4, winograd_F73_mk4_f_nchw44)
} // namespace winograd } // namespace winograd
......
#include "src/common/unroll_macro.h"
#include "src/common/utils.h"
#include "src/common/winograd/winograd_helper.h"
#include "src/fallback/conv_bias/gi/fp32/filter_transform.h"
#include "src/fallback/conv_bias/gi/fp32/helper.h"
#include "src/fallback/conv_bias/gi/fp32/strategy.h"
#include "src/fallback/conv_bias/winograd/winograd.h"
#include "src/fallback/elemwise_helper/op_unary.h"
#include "midout.h"
//! midout tag for this file; it is passed to GI_DISPATCH_CONV_WINOGRAD_BIAS
//! in winograd_4x3_4x4_f::output below.
MIDOUT_DECL(megdnn_fallback_winograd_fp32_F43_4x4)
using namespace megdnn;
using namespace fallback;
namespace {
//! File-local shorthands, undefined again once both transforms are defined.
//! Judging by the formulas in the transform comments below, MLAF(a, b, s)
//! is used as a + b * s and MLSF(a, b, s) as a - b * s.
#define MLAF GiMultiplyAddScalarFloat32
#define MLSF GiMultiplySubScalarFloat32
/**
 * @brief Input-side helpers of the winograd F(4x4, 3x3) MK4 strategy.
 *
 * transpose() gathers one 6x6 input tile for a group of 4 channels and
 * re-lays it out so the 4 channels of each pixel are adjacent; transform()
 * applies BT * d * B to that tile and scatters the 36 results into the
 * matmul input buffer.
 */
struct InputTransform4X3 {
    /**
     * @brief Convert layout from NCHW to NCHW44(i.e. NC4HW4)
     *
     * @tparam inner Whether all data in [[ih_start, ih_start+6), [iw_start,
     * iw_start+6)] is in @input
     * @param input Pointer which points to all input data(CHW, exclude dim N)
     * @param patch Buffer which size is sizeof(float) * 4 * 6 * 6. Continuous storage
     * of data for the current block, order by C, H, W.
     * @param patchT RETURN
     * @param ih_start The start index of dim H of current block
     * @param iw_start The start index of dim W of current block
     * @param IH Dim H of input
     * @param IW Dim W of input
     * @param ic The index of dim C of input
     * @param IC Dim C of input
     */
    template <bool inner>
    static void transpose(
            const float* input, float* patch, float* patchT, int ih_start, int iw_start,
            size_t IH, size_t IW, size_t ic, size_t IC) {
        constexpr size_t alpha = 4 + 3 - 1;
        //! positions that will not be copied must read as zero padding
        if (!inner || ic + 4 > IC) {
            memset(patch, 0, sizeof(float) * 4 * alpha * alpha);
        }
        if (inner) {
            const float* input_ptr = input + ic * IH * IW + ih_start * IW + iw_start;
            for (size_t ico = 0; ico < 4; ++ico) {
                if (ic + ico < IC) {
                    //! Each row is copied as a full vector (columns 0..3) plus a
                    //! low-half load (columns 4..5) that is written back with
                    //! GiStoreFloat32 at column 4.
                    //! NOTE(review): if GiStoreFloat32 writes a full 4-lane
                    //! vector (as the "* 4" strides in this file suggest), that
                    //! second store spills two floats past the 6-wide row; for
                    //! the last row of the last channel the spill lands just
                    //! past `patch` -- confirm the caller leaves room there.
#define cb(i)                                                         \
    auto v##i##0 = GiLoadFloat32(input_ptr + i * IW);                 \
    GiStoreFloat32(patch + ico * alpha * alpha + i * alpha, v##i##0); \
    auto v##i##1 = GiLoadFloat32LowHalf(input_ptr + i * IW + 4);      \
    GiStoreFloat32(patch + ico * alpha * alpha + i * alpha + 4, v##i##1);
                    UNROLL_CALL_NOWRAPPER(6, cb);
#undef cb
                    input_ptr += IH * IW;
                }
            }
        } else {
            //! Clip the tile against the image in signed arithmetic. Doing the
            //! sum "ih_start + alpha" in size_t wraps around when the tile
            //! starts 7 or more rows/columns inside the padding, which made
            //! the upper bound collapse to IH/IW and the copy below write
            //! outside the 6x6 patch. With signed bounds such a tile yields an
            //! empty range and the patch stays all-zero.
            const int tile_extent = static_cast<int>(alpha);
            const int ih0 = std::max(0, ih_start);
            const int ih1 = std::min(ih_start + tile_extent, static_cast<int>(IH));
            const int iw0 = std::max(0, iw_start);
            const int iw1 = std::min(iw_start + tile_extent, static_cast<int>(IW));
            for (size_t ico = 0; ico < 4 && ic + ico < IC; ++ico) {
                for (int ih = ih0; ih < ih1; ++ih) {
                    //! ih >= ih_start and ih >= 0 hold here, so both casts are safe
                    const size_t patch_row = static_cast<size_t>(ih - ih_start);
                    const size_t src_row = static_cast<size_t>(ih);
                    for (int iw = iw0; iw < iw1; ++iw) {
                        const size_t patch_col = static_cast<size_t>(iw - iw_start);
                        const size_t src_col = static_cast<size_t>(iw);
                        patch[ico * alpha * alpha + patch_row * alpha + patch_col] =
                                input[(ic + ico) * IH * IW + src_row * IW + src_col];
                    }
                }
            }
        }
        //! view patch as a 4 x 36 matrix (row stride 36) and transpose it in
        //! nine 4x4 blocks into patchT (row stride 4)
#define cb(i) transpose_4x4(patch + i * 4, patchT + i * 16, 36, 4);
        UNROLL_CALL_NOWRAPPER(9, cb);
#undef cb
    }

    /**
     * @brief Apply the F(4x4, 3x3) input transform BT * d * B to one tile.
     *
     * @param patchT Tile produced by transpose(): element (m, n) is read from
     *               patchT + m * 6 * 4 + n * 4
     * @param input_transform_buf Destination; element (m, n) is stored at
     *               (m * 6 + n) * ICB * 4 * nr_units_in_tile +
     *               icb * nr_units_in_tile * 4 + unit_idx * 4,
     *               with ICB = IC / 4 and icb = ic / 4
     * @param unit_idx Index of this tile inside the current batch of tiles
     * @param nr_units_in_tile Number of tiles in the current batch
     * @param ic The index of dim C of input
     * @param IC Dim C of input
     */
    static void transform(
            const float* patchT, float* input_transform_buf, size_t unit_idx,
            size_t nr_units_in_tile, size_t ic, size_t IC) {
        constexpr size_t alpha = 4 + 3 - 1;
#define cb(m, n) \
    GI_FLOAT32_t d##m##n = GiLoadFloat32(patchT + m * alpha * 4 + n * 4), wd##m##n;
        UNROLL_CALL_NOWRAPPER_D2(6, 6, cb);
#undef cb
        //! BT
        //! 4  0 -5  0  1  0
        //! 0 -4 -4  1  1  0
        //! 0  4 -4 -1  1  0
        //! 0 -2 -1  2  1  0
        //! 0  2 -1 -2  1  0
        //! 0  4  0 -5  0  1

        //! wd0n = 4 * (d0n - d2n) + (d4n - d2n)
        //! wd1n = (d3n + d4n) - 4 * (d1n + d2n)
        //! wd2n = 4 * (d1n - d2n) + (d4n - d3n)
        //! wd3n = (d4n - d2n) - 2 * (d1n - d3n)
        //! wd4n = 2 * (d1n - d3n) + (d4n - d2n)
        //! wd5n = 4 * (d1n - d3n) + (d5n - d3n)
#define cb(n)                                                         \
    {                                                                 \
        auto&& d4subd2 = SUBF(d4##n, d2##n);                          \
        auto&& d1subd3 = SUBF(d1##n, d3##n);                          \
        wd0##n = MLAF(d4subd2, SUBF(d0##n, d2##n), 4.0f);             \
        wd1##n = MLSF(ADDF(d3##n, d4##n), ADDF(d1##n, d2##n), 4.0f);  \
        wd2##n = MLAF(SUBF(d4##n, d3##n), SUBF(d1##n, d2##n), 4.0f);  \
        auto&& double_d1subd3 = MULSF(d1subd3, 2.0f);                 \
        wd3##n = SUBF(d4subd2, double_d1subd3);                       \
        wd4##n = ADDF(double_d1subd3, d4subd2);                       \
        wd5##n = MLAF(SUBF(d5##n, d3##n), d1subd3, 4.0f);             \
    }
        UNROLL_CALL_NOWRAPPER(6, cb);
#undef cb
        //! B
        //!  4  0  0  0  0  0
        //!  0 -4  4 -2  2  4
        //! -5 -4 -4 -1 -1  0
        //!  0  1 -1  2 -2 -5
        //!  1  1  1  1  1  0
        //!  0  0  0  0  0  1

        //! dm0 = 4 * (wdm0 - wdm2) + (wdm4 - wdm2)
        //! dm1 = (wdm3 + wdm4) - 4 * (wdm1 + wdm2)
        //! dm2 = 4 * (wdm1 - wdm2) + (wdm4 - wdm3)
        //! dm3 = (wdm4 - wdm2) - 2 * (wdm1 - wdm3)
        //! dm4 = 2 * (wdm1 - wdm3) + (wdm4 - wdm2)
        //! dm5 = 4 * (wdm1 - wdm3) + (wdm5 - wdm3)
#define cb(m)                                                                     \
    {                                                                             \
        auto&& wd4subwd2 = SUBF(wd##m##4, wd##m##2);                              \
        auto&& wd1subwd3 = SUBF(wd##m##1, wd##m##3);                              \
        d##m##0 = MLAF(wd4subwd2, SUBF(wd##m##0, wd##m##2), 4.0f);                \
        d##m##1 = MLSF(ADDF(wd##m##3, wd##m##4), ADDF(wd##m##1, wd##m##2), 4.0f); \
        d##m##2 = MLAF(SUBF(wd##m##4, wd##m##3), SUBF(wd##m##1, wd##m##2), 4.0f); \
        auto&& double_wd1subwd3 = MULSF(wd1subwd3, 2.0f);                         \
        d##m##3 = SUBF(wd4subwd2, double_wd1subwd3);                              \
        d##m##4 = ADDF(double_wd1subwd3, wd4subwd2);                              \
        d##m##5 = MLAF(SUBF(wd##m##5, wd##m##3), wd1subwd3, 4.0f);                \
    }
        UNROLL_CALL_NOWRAPPER(6, cb);
#undef cb
        size_t ICB = IC / 4;
        size_t icb = ic / 4;
#define cb(m, n)                                                                 \
    GiStoreFloat32(                                                              \
            input_transform_buf + (m * alpha + n) * ICB * 4 * nr_units_in_tile + \
                    icb * nr_units_in_tile * 4 + unit_idx * 4,                   \
            d##m##n);
        UNROLL_CALL_NOWRAPPER_D2(6, 6, cb);
#undef cb
    }
};  // InputTransform4X3
/**
 * @brief Output transform of the winograd F(4x4, 3x3) MK4 strategy.
 *
 * Loads the 6x6 matmul results of one tile for a group of 4 output channels,
 * applies AT * v * A to obtain a 4x4 tile, applies bias and Op, and writes
 * the values to output[(oc) * OH * OW + oh * OW + ow].
 *
 * @tparam bmode Bias mode handled by this instantiation
 * @tparam Op    Functor constructed from (src_dtype, dst_dtype); it is applied
 *               both to vector values and to scalar floats below
 */
template <BiasMode bmode, typename Op>
struct OutputTransform4X3 {
/**
 * @param output_transform_buf Matmul results; element (m, n) is read from
 *        (m * 6 + n) * OCB * nr_units_in_tile * 4 + ocb * nr_units_in_tile * 4
 *        + unit_idx * 4, with OCB = (oc_end - oc_start) / 4, ocb = oc_index / 4
 * @param bias Bias data; read at bias + oc for BROADCAST_CHANNEL_BIAS and
 *        with the same indexing as output for BIAS
 * @param output Destination tensor
 * @param transform_mid_buf Scratch buffer that receives the 4x4 tile before
 *        the scalar write-out (indices 0..63 are read back below)
 * @param oh_start,ow_start Top-left output coordinate of this tile
 * @param OH,OW Output spatial size; elements outside are skipped
 * @param oc_start,oc_end Output channel range handled by the caller
 * @param oc_index Offset of this channel group from oc_start
 * @param unit_idx Index of this tile inside the current batch of tiles
 * @param nr_units_in_tile Number of tiles in the current batch
 * @param src_dtype,dst_dtype Forwarded to the Op constructor
 */
static void transform(
const float* output_transform_buf, const float* bias, float* output,
float* transform_mid_buf, size_t oh_start, size_t ow_start, size_t OH,
size_t OW, size_t oc_start, size_t oc_end, size_t oc_index, size_t unit_idx,
size_t nr_units_in_tile, const DType& src_dtype, const DType& dst_dtype) {
Op op(src_dtype, dst_dtype);
constexpr size_t alpha = 4 + 3 - 1;
size_t oc = oc_start + oc_index;
size_t OCB = (oc_end - oc_start) / 4;
size_t ocb = oc_index / 4;
#define cb(m, n) \
auto v##m##n = GiLoadFloat32( \
output_transform_buf + (m * alpha + n) * OCB * nr_units_in_tile * 4 + \
ocb * nr_units_in_tile * 4 + unit_idx * 4);
UNROLL_CALL_NOWRAPPER_D2(6, 6, cb);
#undef cb
//! AT
//! 1 1 1 1 1 0
//! 0 1 -1 2 -2 0
//! 0 1 1 4 4 0
//! 0 1 -1 8 -8 1
//! t0n = v0n + (v1n + v2n) + (v3n + v4n)
//! t1n = (v1n - v2n) + 2 * (v3n - v4n)
//! t2n = (v1n + v2n) + 4 * (v3n + v4n)
//! t3n = (v1n - v2n) + 8 * (v3n - v4n) + v5n
#define cb(m, n) GI_FLOAT32_t t##m##n;
UNROLL_CALL_NOWRAPPER_D2(4, 6, cb);
#undef cb
#define cb(n) \
{ \
auto&& v1addv2 = ADDF(v1##n, v2##n); \
auto&& v1subv2 = SUBF(v1##n, v2##n); \
auto&& v3addv4 = ADDF(v3##n, v4##n); \
auto&& v3subv4 = SUBF(v3##n, v4##n); \
\
t0##n = ADDF(ADDF(v0##n, v1addv2), v3addv4); \
t1##n = MLAF(v1subv2, v3subv4, 2.0f); \
t2##n = MLAF(v1addv2, v3addv4, 4.0f); \
t3##n = ADDF(MLAF(v1subv2, v3subv4, 8.0f), v5##n); \
}
UNROLL_CALL_NOWRAPPER(6, cb);
#undef cb
//! A
//! 1 0 0 0
//! 1 1 1 1
//! 1 -1 1 -1
//! 1 2 4 8
//! 1 -2 4 -8
//! 0 0 0 1
// vm0 = tm0 + (tm1 + tm2) + (tm3 + tm4)
// vm1 = (tm1 - tm2) + 2 * (tm3 - tm4)
// vm2 = (tm1 + tm2) + 4 * (tm3 + tm4)
// vm3 = (tm1 - tm2) + 8 * (tm3 - tm4) + tm5
//! the results overwrite v00..v33; only that 4x4 corner is used afterwards
#define cb(m) \
{ \
auto&& t1addt2 = ADDF(t##m##1, t##m##2); \
auto&& t1subt2 = SUBF(t##m##1, t##m##2); \
auto&& t3addt4 = ADDF(t##m##3, t##m##4); \
auto&& t3subt4 = SUBF(t##m##3, t##m##4); \
v##m##0 = ADDF(ADDF(t##m##0, t1addt2), t3addt4); \
v##m##1 = MLAF(t1subt2, t3subt4, 2.0f); \
v##m##2 = MLAF(t1addt2, t3addt4, 4.0f); \
v##m##3 = ADDF(MLAF(t1subt2, t3subt4, 8.0f), t##m##5); \
}
UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
GI_FLOAT32_t vbias;
//! per-channel bias: one vector loaded at bias + oc is added to all 16 values
if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
vbias = GiLoadFloat32(bias + oc);
#define cb(m, n) v##m##n = GiAddFloat32(v##m##n, vbias);
UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb
}
//! for BiasMode::BIAS the bias add and op are done per element in the
//! scalar loop at the end instead
if (bmode != BiasMode::BIAS) {
#define cb(m, n) v##m##n = op(v##m##n);
UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb
}
//! spill the tile to the scratch buffer: value (m, n) goes to offset
//! (4 * m + n) * 4
#define cb(m, n) GiStoreFloat32(transform_mid_buf + (4 * m + n) * 4, v##m##n);
UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb
//! scalar write-out; the loop bounds drop rows, columns and channels that
//! fall outside OH, OW or oc_end
for (size_t oho = 0; oho < 4 && oh_start + oho < OH; ++oho) {
for (size_t owo = 0; owo < 4 && ow_start + owo < OW; ++owo) {
for (size_t oco = 0; oco < 4 && oc + oco < oc_end; ++oco) {
float res = transform_mid_buf[oho * 4 * 4 + owo * 4 + oco];
size_t oh = oh_start + oho;
size_t ow = ow_start + owo;
if (bmode == BiasMode::BIAS) {
res += bias[(oc + oco) * OH * OW + oh * OW + ow];
res = op(res);
}
output[(oc + oco) * OH * OW + oh * OW + ow] = res;
}
}
}
}
}; // OutputTransform4X3
#undef MLSF
#undef MLAF
} // namespace
namespace megdnn {
namespace fallback {
namespace winograd {
//! NOTE(review): presumably emits the out-of-line definitions that pair with
//! the MEGDNN_REG_WINOGRAD_STRATEGY declaration of winograd_4x3_4x4_f --
//! confirm against the macro definition.
MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_4x3_4x4_f)
void winograd_4x3_4x4_f::filter(
const float* filter, float* filter_transform_buf, float* transform_mid_buf,
size_t OC, size_t IC, size_t oc_start, size_t oc_end) {
FilterTransform4X3<megdnn::param::MatrixMul::Format::MK4>::transform(
filter, filter_transform_buf, transform_mid_buf, OC, IC, oc_start, oc_end);
}
/**
 * @brief Input transform of the F(4x4, 3x3) MK4 strategy for one batch of
 *        tiles.
 *
 * For every group of 4 input channels and every tile in
 * [unit_start_idx, unit_start_idx + nr_units_in_tile) the 6x6 input patch is
 * gathered (zero-filled where it leaves the image) and transformed into
 * input_transform_buf. IC must be a multiple of 4.
 *
 * @param transform_mid_buf Scratch buffer; the first 4 * ALPHA * ALPHA floats
 *        hold the gathered patch, the transposed patch follows directly after
 * @param PH,PW Padding of the convolution along H and W
 */
void winograd_4x3_4x4_f::input(
        const float* input, float* input_transform_buf, float* transform_mid_buf,
        size_t IH, size_t IW, size_t IC, size_t PH, size_t PW, size_t unit_start_idx,
        size_t nr_units_in_tile) {
    megdnn_assert(IC % 4 == 0);

    //! number of tiles along the output width
    const auto tiles_per_row =
            div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE);
    float* const patch = transform_mid_buf;
    float* const patchT = transform_mid_buf + 4 * ALPHA * ALPHA;
    const int ih_limit = static_cast<int>(IH);
    const int iw_limit = static_cast<int>(IW);

    for (size_t ic = 0; ic < IC; ic += 4) {
        for (size_t unit_idx = 0; unit_idx < nr_units_in_tile; ++unit_idx) {
            const size_t tile = unit_start_idx + unit_idx;
            const size_t tile_row = tile / tiles_per_row;
            const size_t tile_col = tile % tiles_per_row;
            //! may be negative when the tile begins inside the padding
            const int ih_start = static_cast<int>(tile_row * OUTPUT_BLOCK_SIZE - PH);
            const int iw_start = static_cast<int>(tile_col * OUTPUT_BLOCK_SIZE - PW);

            const bool leaves_image = ih_start < 0 || ih_start + 6 > ih_limit ||
                                      iw_start < 0 || iw_start + 6 > iw_limit;
            if (leaves_image) {
                InputTransform4X3::transpose<false>(
                        input, patch, patchT, ih_start, iw_start, IH, IW, ic, IC);
            } else {
                InputTransform4X3::transpose<true>(
                        input, patch, patchT, ih_start, iw_start, IH, IW, ic, IC);
            }
            InputTransform4X3::transform(
                    patchT, input_transform_buf, unit_idx, nr_units_in_tile, ic, IC);
        }
    }
}
/**
 * @brief Output transform of the F(4x4, 3x3) MK4 strategy for one batch of
 *        tiles.
 *
 * Walks the output channels [oc_start, oc_end) in groups of 4 and, for every
 * tile of the batch, dispatches on (bmode, nonline_mode) to the matching
 * OutputTransform4X3 instantiation.
 */
void winograd_4x3_4x4_f::output(
        const float* output_transform_buf, const float* bias, float* output,
        float* transform_mid_buf, BiasMode bmode, NonlineMode nonline_mode, size_t OH,
        size_t OW, size_t oc_start, size_t oc_end, size_t unit_start_idx,
        size_t nr_units_in_tile) {
#define cb(_bmode, _nonline_mode, ...) \
    OutputTransform4X3<_bmode, _nonline_mode>::transform(__VA_ARGS__);

    //! number of tiles along the output width
    const auto tiles_per_row = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);

    for (size_t oc_index = 0; oc_start + oc_index < oc_end; oc_index += 4) {
        for (size_t unit_idx = 0; unit_idx < nr_units_in_tile; ++unit_idx) {
            const size_t tile = unit_idx + unit_start_idx;
            const size_t oh_start = (tile / tiles_per_row) * OUTPUT_BLOCK_SIZE;
            const size_t ow_start = (tile % tiles_per_row) * OUTPUT_BLOCK_SIZE;
            GI_DISPATCH_CONV_WINOGRAD_BIAS(
                    megdnn_fallback_winograd_fp32_F43_4x4, cb, float, float, bmode,
                    nonline_mode, output_transform_buf, bias, output, transform_mid_buf,
                    oh_start, ow_start, OH, OW, oc_start, oc_end, oc_index, unit_idx,
                    nr_units_in_tile, src_dtype, dst_dtype);
        }
    }
#undef cb
}
} // namespace winograd
} // namespace fallback
} // namespace megdnn
\ No newline at end of file
...@@ -121,7 +121,7 @@ public: ...@@ -121,7 +121,7 @@ public:
for (auto&& algo : matmul_algos) { for (auto&& algo : matmul_algos) {
if (is_naive(algo)) if (is_naive(algo))
continue; continue;
for (uint32_t tile_size : {16, 8, 24, 32}) { for (uint32_t tile_size : {16, 8, 24, 32, 48, 68}) {
refhold.emplace_back(new AlgoFP32WinogradF23_4x4( refhold.emplace_back(new AlgoFP32WinogradF23_4x4(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size)); tile_size));
...@@ -130,10 +130,18 @@ public: ...@@ -130,10 +130,18 @@ public:
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size)); tile_size));
m_gi_winograd_algos.emplace_back(refhold.back().get()); m_gi_winograd_algos.emplace_back(refhold.back().get());
refhold.emplace_back(new AlgoFP32WinogradF43_4x4(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size));
m_gi_winograd_algos.emplace_back(refhold.back().get());
refhold.emplace_back(new AlgoFP32WinogradF63_4x4_NCHW44( refhold.emplace_back(new AlgoFP32WinogradF63_4x4_NCHW44(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size)); tile_size));
m_gi_winograd_algos.emplace_back(refhold.back().get()); m_gi_winograd_algos.emplace_back(refhold.back().get());
refhold.emplace_back(new AlgoFP32WinogradF43_4x4_NCHW44(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size));
m_gi_winograd_algos.emplace_back(refhold.back().get());
refhold.emplace_back(new AlgoFP32WinogradF23_4x4_NCHW44( refhold.emplace_back(new AlgoFP32WinogradF23_4x4_NCHW44(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size)); tile_size));
......
...@@ -219,9 +219,11 @@ public: ...@@ -219,9 +219,11 @@ public:
GI_COMMON_WINOGRAD_F63_FP32, GI_COMMON_WINOGRAD_F63_FP32,
GI_COMMON_WINOGRAD_F43_FP32, GI_COMMON_WINOGRAD_F43_FP32,
GI_COMMON_WINOGRAD_F63_4X4_FP32, GI_COMMON_WINOGRAD_F63_4X4_FP32,
GI_COMMON_WINOGRAD_F43_4X4_FP32,
GI_COMMON_WINOGRAD_F54_FP32, GI_COMMON_WINOGRAD_F54_FP32,
GI_COMMON_WINOGRAD_F45_FP32, GI_COMMON_WINOGRAD_F45_FP32,
GI_COMMON_WINOGRAD_F23_4X4_NCHW44_F32, GI_COMMON_WINOGRAD_F23_4X4_NCHW44_F32,
GI_COMMON_WINOGRAD_F43_4X4_NCHW44_F32,
GI_COMMON_WINOGRAD_F63_4X4_NCHW44_F32, GI_COMMON_WINOGRAD_F63_4X4_NCHW44_F32,
GI_COMMON_WINOGRAD_F73_4X4_NCHW44_F32, GI_COMMON_WINOGRAD_F73_4X4_NCHW44_F32,
GI_COMMON_DIRECT_FP32, GI_COMMON_DIRECT_FP32,
...@@ -382,9 +384,11 @@ private: ...@@ -382,9 +384,11 @@ private:
class AlgoFP32WinogradF63; class AlgoFP32WinogradF63;
class AlgoFP32WinogradF43; class AlgoFP32WinogradF43;
class AlgoFP32WinogradF63_4x4; class AlgoFP32WinogradF63_4x4;
class AlgoFP32WinogradF43_4x4;
class AlgoFP32WinogradF54; class AlgoFP32WinogradF54;
class AlgoFP32WinogradF45; class AlgoFP32WinogradF45;
class AlgoFP32WinogradF23_4x4_NCHW44; class AlgoFP32WinogradF23_4x4_NCHW44;
class AlgoFP32WinogradF43_4x4_NCHW44;
class AlgoFP32WinogradF63_4x4_NCHW44; class AlgoFP32WinogradF63_4x4_NCHW44;
class AlgoFP32WinogradF73_4x4_NCHW44; class AlgoFP32WinogradF73_4x4_NCHW44;
......
...@@ -1013,6 +1013,27 @@ TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F43_F63) { ...@@ -1013,6 +1013,27 @@ TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F43_F63) {
handle(), 3); handle(), 3);
#endif #endif
} }
//! Benchmark comparing algorithms whose names match "WINOGRAD:.*:4:4:.*:3"
//! against those matching "WINOGRAD:.*:4:2", with kernel 3 and pack_size 4.
//! The body is compiled only when MEGDNN_AARCH64 is set.
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_44_F43_F23) {
#if MEGDNN_AARCH64
benchmark_winograd_compare(
"WINOGRAD:.*:4:4:.*:3", "WINOGRAD:.*:4:2", handle(), 3, 4);
#endif
}
//! Weight-preprocess benchmark for algorithms matching "WINOGRAD:.*:4:4:.*:3",
//! with kernel 3 and pack_size 4 (io_pack_size left at its default).
//! The body is compiled only when MEGDNN_AARCH64 is set.
TEST_F(ARM_COMMON, BENCHMARK_WINOGRAD_F43_44) {
#if MEGDNN_AARCH64
benchmark_winograd_weight_preprocess("WINOGRAD:.*:4:4:.*:3", handle(), 3, 4);
#endif
}
//! Weight-preprocess benchmark for algorithms matching
//! "WINOGRAD_NCHW44:.*:4:4:.*:3", with kernel 3, pack_size 4 and
//! io_pack_size 4. The body is compiled only when MEGDNN_AARCH64 is set.
TEST_F(ARM_COMMON, BENCHMARK_WINOGRAD_F43_NCHW44) {
#if MEGDNN_AARCH64
benchmark_winograd_weight_preprocess(
"WINOGRAD_NCHW44:.*:4:4:.*:3", handle(), 3, 4, 4);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63) { TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63) {
#if MEGDNN_AARCH64 #if MEGDNN_AARCH64
benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:6", handle(), 3); benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:6", handle(), 3);
......
...@@ -902,7 +902,8 @@ void check_conv_bias( ...@@ -902,7 +902,8 @@ void check_conv_bias(
} }
#if MEGDNN_WITH_BENCHMARK #if MEGDNN_WITH_BENCHMARK
std::vector<conv_bias::TestArg> get_winograd_benchmark_args( std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
size_t kernel, size_t pack_size) { size_t kernel, size_t pack_size, size_t io_pack_size) {
megdnn_assert(io_pack_size == 1 || io_pack_size == 4);
std::vector<conv_bias::TestArg> args; std::vector<conv_bias::TestArg> args;
auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p) { auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p) {
if (ic % pack_size != 0 || oc % pack_size != 0) if (ic % pack_size != 0 || oc % pack_size != 0)
...@@ -915,11 +916,20 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args( ...@@ -915,11 +916,20 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
param.pad_h = p; param.pad_h = p;
param.pad_w = p; param.pad_w = p;
if (io_pack_size == 4) {
param.format = param::ConvBias::Format::NCHW44;
args.push_back(conv_bias::TestArg{
param,
TensorShape{1, ic / 4, h, w, 4},
TensorShape{oc / 4, ic / 4, kernel, kernel, 4, 4},
{1, oc / 4, 1, 1, 4}});
} else {
args.push_back(conv_bias::TestArg{ args.push_back(conv_bias::TestArg{
param, param,
TensorShape{1, ic, h, w}, TensorShape{1, ic, h, w},
TensorShape{oc, ic, kernel, kernel}, TensorShape{oc, ic, kernel, kernel},
{1, oc, 1, 1}}); {1, oc, 1, 1}});
}
}; };
for (size_t ic : {8, 16, 32, 64}) { for (size_t ic : {8, 16, 32, 64}) {
...@@ -950,8 +960,9 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args( ...@@ -950,8 +960,9 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
} }
void benchmark_winograd( void benchmark_winograd(
const char* algo_name, Handle* handle, size_t kernel, size_t pack_size) { const char* algo_name, Handle* handle, size_t kernel, size_t pack_size,
auto&& args = get_winograd_benchmark_args(kernel, pack_size); size_t io_pack_size) {
auto&& args = get_winograd_benchmark_args(kernel, pack_size, io_pack_size);
using namespace conv_bias; using namespace conv_bias;
constexpr size_t RUN = 10; constexpr size_t RUN = 10;
Benchmarker<Convolution> benchmark(handle); Benchmarker<Convolution> benchmark(handle);
...@@ -969,10 +980,17 @@ void benchmark_winograd( ...@@ -969,10 +980,17 @@ void benchmark_winograd(
opr->deduce_layout( opr->deduce_layout(
{arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()}, {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
{arg.bias, dtype::Float32()}, {}, dst_layout); {arg.bias, dtype::Float32()}, {}, dst_layout);
float computations = 0.0;
if (io_pack_size == 1) {
//! dst.nr_elems * IC * FH * FW * 2 //! dst.nr_elems * IC * FH * FW * 2
float computations = dst_layout.total_nr_elems() * arg.filter[1] * computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
arg.filter[2] * arg.filter[3] * 2.0 / arg.filter[3] * 2.0 / (1024 * 1024 * 1024) * 1e3;
(1024 * 1024 * 1024) * 1e3; } else {
//! dst.nr_elems * IC/4 * FH * FW * 4 * 2
computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
arg.filter[3] * arg.filter[4] * 2.0 / (1024 * 1024 * 1024) *
1e3;
}
param::Convolution conv_param; param::Convolution conv_param;
conv_param.pad_h = arg.param.pad_h; conv_param.pad_h = arg.param.pad_h;
...@@ -999,9 +1017,9 @@ void benchmark_winograd( ...@@ -999,9 +1017,9 @@ void benchmark_winograd(
// usage of weight pre-processing for winograd benchmark // usage of weight pre-processing for winograd benchmark
void benchmark_winograd_weight_preprocess( void benchmark_winograd_weight_preprocess(
const char* algo_name, megdnn::Handle* handle, size_t kernel, const char* algo_name, megdnn::Handle* handle, size_t kernel, size_t pack_size,
size_t pack_size) { size_t io_pack_size) {
auto&& args = get_winograd_benchmark_args(kernel, pack_size); auto&& args = get_winograd_benchmark_args(kernel, pack_size, io_pack_size);
using namespace conv_bias; using namespace conv_bias;
constexpr size_t RUN = 10; constexpr size_t RUN = 10;
...@@ -1018,16 +1036,17 @@ void benchmark_winograd_weight_preprocess( ...@@ -1018,16 +1036,17 @@ void benchmark_winograd_weight_preprocess(
opr->deduce_layout( opr->deduce_layout(
{arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()}, {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
{arg.bias, dtype::Float32()}, {}, dst_layout); {arg.bias, dtype::Float32()}, {}, dst_layout);
float computations = 0.0;
if (io_pack_size == 1) {
//! dst.nr_elems * IC * FH * FW * 2 //! dst.nr_elems * IC * FH * FW * 2
float computations = dst_layout.total_nr_elems() * arg.filter[1] * computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
arg.filter[2] * arg.filter[3] * 2.0 / arg.filter[3] * 2.0 / (1024 * 1024 * 1024) * 1e3;
(1024 * 1024 * 1024) * 1e3; } else {
//! dst.nr_elems * IC/4 * FH * FW * 4 * 2
param::Convolution conv_param; computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
conv_param.pad_h = arg.param.pad_h; arg.filter[3] * arg.filter[4] * 2.0 / (1024 * 1024 * 1024) *
conv_param.pad_w = arg.param.pad_w; 1e3;
conv_param.stride_h = arg.param.stride_h; }
conv_param.stride_w = arg.param.stride_w;
benchmark_winograd.set_param(arg.param); benchmark_winograd.set_param(arg.param);
auto used_winograd = auto used_winograd =
...@@ -1045,8 +1064,8 @@ void benchmark_winograd_weight_preprocess( ...@@ -1045,8 +1064,8 @@ void benchmark_winograd_weight_preprocess(
void benchmark_winograd_compare( void benchmark_winograd_compare(
const char* algoA_name, const char* algoB_name, megdnn::Handle* handle, const char* algoA_name, const char* algoB_name, megdnn::Handle* handle,
size_t kernel, size_t pack_size) { size_t kernel, size_t pack_size, size_t io_pack_size) {
auto&& args = get_winograd_benchmark_args(kernel, pack_size); auto&& args = get_winograd_benchmark_args(kernel, pack_size, io_pack_size);
using namespace conv_bias; using namespace conv_bias;
constexpr size_t RUN = 10; constexpr size_t RUN = 10;
...@@ -1062,16 +1081,17 @@ void benchmark_winograd_compare( ...@@ -1062,16 +1081,17 @@ void benchmark_winograd_compare(
opr->deduce_layout( opr->deduce_layout(
{arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()}, {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
{arg.bias, dtype::Float32()}, {}, dst_layout); {arg.bias, dtype::Float32()}, {}, dst_layout);
float computations = 0.0;
if (io_pack_size == 1) {
//! dst.nr_elems * IC * FH * FW * 2 //! dst.nr_elems * IC * FH * FW * 2
float computations = dst_layout.total_nr_elems() * arg.filter[1] * computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
arg.filter[2] * arg.filter[3] * 2.0 / arg.filter[3] * 2.0 / (1024 * 1024 * 1024) * 1e3;
(1024 * 1024 * 1024) * 1e3; } else {
//! dst.nr_elems * IC/4 * FH * FW * 4 * 2
param::Convolution conv_param; computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
conv_param.pad_h = arg.param.pad_h; arg.filter[3] * arg.filter[4] * 2.0 / (1024 * 1024 * 1024) *
conv_param.pad_w = arg.param.pad_w; 1e3;
conv_param.stride_h = arg.param.stride_h; }
conv_param.stride_w = arg.param.stride_w;
benchmark_winograd.set_param(arg.param); benchmark_winograd.set_param(arg.param);
auto used_winograd1 = auto used_winograd1 =
......
...@@ -62,16 +62,16 @@ void check_conv_bias( ...@@ -62,16 +62,16 @@ void check_conv_bias(
#if MEGDNN_WITH_BENCHMARK #if MEGDNN_WITH_BENCHMARK
std::vector<conv_bias::TestArg> get_winograd_benchmark_args( std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
size_t kernel, size_t pack_size = 1); size_t kernel, size_t pack_size = 1, size_t io_pack_size = 1);
void benchmark_winograd( void benchmark_winograd(
const char* algo_name, megdnn::Handle* handle, size_t kernel, const char* algo_name, megdnn::Handle* handle, size_t kernel,
size_t pack_size = 1); size_t pack_size = 1, size_t io_pack_size = 1);
void benchmark_winograd_weight_preprocess( void benchmark_winograd_weight_preprocess(
const char* algo_name, megdnn::Handle* handle, size_t kernel, const char* algo_name, megdnn::Handle* handle, size_t kernel,
size_t pack_size = 1); size_t pack_size = 1, size_t io_pack_size = 1);
void benchmark_winograd_compare( void benchmark_winograd_compare(
const char* algoA_name, const char* algoB_name, megdnn::Handle* handle, const char* algoA_name, const char* algoB_name, megdnn::Handle* handle,
size_t kernel, size_t pack_size = 1); size_t kernel, size_t pack_size = 1, size_t io_pack_size = 1);
#endif // MEGDNN_WITH_BENCHMARK #endif // MEGDNN_WITH_BENCHMARK
template <class Checker> template <class Checker>
void check_winograd( void check_winograd(
......
...@@ -597,6 +597,25 @@ TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_NCHW44) { ...@@ -597,6 +597,25 @@ TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_NCHW44) {
param::ConvBias::Format::NCHW44); param::ConvBias::Format::NCHW44);
} }
//! Runs check_winograd with the "4:4:16" algorithm spec, the MK4 matmul format
//! and the NCHW44 conv format, over the arguments returned by
//! get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1).
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F43_4_NCHW44) {
using namespace conv_bias;
std::vector<TestArg> args =
get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
Checker<ConvBiasForward> checker(handle());
check_winograd(
"4:4:16", checker, args, param::MatrixMul::Format::MK4,
param::ConvBias::Format::NCHW44);
}
//! Runs check_winograd with the "4:4:16" algorithm spec and the MK4 matmul
//! format through a checker that uses OprWeightPreprocessProxy, over the
//! arguments returned by get_winograd_mk_packed_args().
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F43_4_WEIGHT_PREPROCESS) {
using namespace conv_bias;
std::vector<TestArg> args = get_winograd_mk_packed_args();
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle());
check_winograd("4:4:16", checker, args, param::MatrixMul::Format::MK4);
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F54) { TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F54) {
using namespace conv_bias; using namespace conv_bias;
std::vector<TestArg> args = get_winograd_args(4); std::vector<TestArg> args = get_winograd_args(4);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册