diff --git a/dnn/scripts/opr_param_defs.py b/dnn/scripts/opr_param_defs.py index 7c214a8bc06f6d267e1600c8c3fe53fa0f6619b8..d60153d400227e768c73d45f17a99da58c3cf9d3 100755 --- a/dnn/scripts/opr_param_defs.py +++ b/dnn/scripts/opr_param_defs.py @@ -445,7 +445,8 @@ pdef('PowC', 'power with constant exponent').add_fields('float32', 'exp', 0) 'uint32', Doc('output_block_size', 'output block size, detail meaning see winograd ' 'in convbias, equals to the meaning of m in F(m, r)'), 0). - add_enum_alias('Format', 'MatrixMul') + add_enum_alias('Format', 'MatrixMul'). + add_enum_alias('ComputeMode', 'Convolution', name_field='compute_mode') ) (pdef('SVD'). diff --git a/dnn/src/arm_common/conv_bias/int8/algos.cpp b/dnn/src/arm_common/conv_bias/int8/algos.cpp index f6d7022e93174a7777a34b30084e08952b60c8f7..348d7faee8d64cb180240676bac87b788e496f7a 100644 --- a/dnn/src/arm_common/conv_bias/int8/algos.cpp +++ b/dnn/src/arm_common/conv_bias/int8/algos.cpp @@ -273,5 +273,167 @@ ConvBiasImpl::AlgoS8WinogradF23_8x8::dispatch_kerns( MIDOUT_END(); return {}; } +//=========================== input int8 compute float32 ========= +bool ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::usable( + fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, + AlgoSelectionStrategy /*algo_selection_strategy*/) const { + MEGDNN_MARK_USED_VAR(param); + MIDOUT_BEGIN(megdnn_arm_common_conv_bias_int8, + midout_iv("arm_common_AlgoS8CF32WinogradF23_4x4::usable"_hash)) { + if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0) + return false; + bool is_matmul_usable = false; + + using Strategy = winograd::winograd_2x3_4x4_s8_f32_nchw44; + Strategy strategy(param.src_type, param.filter_type, param.dst_type); + is_matmul_usable = m_matmul_algo->usable( + megdnn::winograd::ConvBias( + strategy, m_tile_size, param.nr_threads, param.osz[0], + param.osz[1], param.filter_meta.ocpg) + .get_matmul_kern_param(param)); + return is_matmul_usable && + ((opr->param().format == param::ConvBias::Format::NCHW44 && + param.filter_type.enumv() == DTypeEnum::QuantizedS8) || + ((opr->param().format == + param::ConvBias::Format::NCHW44_WINOGRAD) && + opr->param().output_block_size == 2 && + param.winograd_matmul_format == + param::MatrixMul::Format::MK4)) && + opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION && + (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && + param.filter_meta.spatial[0] == 3) && + (param.filter_meta.stride[0] == param.filter_meta.stride[1] && + param.filter_meta.stride[0] == 1) && + (param.filter_meta.dilation[0] == + param.filter_meta.dilation[1] && + param.filter_meta.dilation[0] == 1) && + (param.compute_mode == param::ConvBias::ComputeMode::FLOAT32 || + param.compute_mode == param::ConvBias::ComputeMode::DEFAULT) && + param.src_type.enumv() == DTypeEnum::QuantizedS8 && + param.bias_type.enumv() == DTypeEnum::QuantizedS32 && + param.dst_type.enumv() == DTypeEnum::QuantizedS8; + } + MIDOUT_END(); + return false; +} + +size_t ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::get_workspace( + fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { + MIDOUT_BEGIN( + megdnn_arm_common_conv_bias_int8, + midout_iv("arm_common_AlgoS8CF32WinogradF23_4x4::get_workspace"_hash)) { + winograd::winograd_2x3_4x4_s8_f32_nchw44 strategy( + param.src_type, param.filter_type, param.dst_type); + return megdnn::winograd::ConvBias( + strategy, m_tile_size, param.nr_threads, param.osz[0], + param.osz[1], param.filter_meta.ocpg) + .get_workspace_size(param, m_matmul_algo); + } + MIDOUT_END(); + return 0; +} + +SmallVector +ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::dispatch_kerns( + fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { + MEGDNN_MARK_USED_VAR(param); + MIDOUT_BEGIN( + megdnn_arm_common_conv_bias_int8, + midout_iv( + "arm_common_AlgoS8CF32WinogradF23_4x4::dispatch_kerns"_hash)) { + winograd::winograd_2x3_4x4_s8_f32_nchw44 strategy( + param.src_type, param.filter_type, param.dst_type); + auto winograd_impl = + megdnn::winograd::ConvBias( + strategy, m_tile_size, param.nr_threads, param.osz[0], + param.osz[1], param.filter_meta.ocpg); + return winograd_impl.get_kerns(param, m_matmul_algo); + } + MIDOUT_END(); + return {}; +} +/* ======================= AlgoS8WinogradF23_8x8_NCHW44 ======================== */ +bool ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::usable( + fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, + AlgoSelectionStrategy /*algo_selection_strategy*/) const { + + MIDOUT_BEGIN( + megdnn_arm_common_conv_bias_int8, + midout_iv( + "arm_common_AlgoS8WinogradF23_8x8_NCHW44::usable"_hash)) { + if (param.filter_meta.icpg % 8 != 0 || param.filter_meta.ocpg % 8 != 0) + return false; + using Strategy = winograd::winograd_2x3_8x8_s8_nchw44; + Strategy strategy(param.src_type, param.filter_type, param.dst_type); + auto&& matmul_param = + megdnn::winograd::ConvBias( + strategy, m_tile_size, param.nr_threads, param.osz[0], + param.osz[1], param.filter_meta.ocpg) + .get_matmul_kern_param(param); + bool is_matmul_usable = m_matmul_algo->usable(matmul_param); + return is_matmul_usable && + ((opr->param().format == param::ConvBias::Format::NCHW44 && + param.filter_type.enumv() == DTypeEnum::QuantizedS8) || + (opr->param().format == param::ConvBias::Format::NCHW44_WINOGRAD && + opr->param().output_block_size == 2 && + param.winograd_matmul_format == param::MatrixMul::Format::MK8 && + param.filter_type.enumv() == DTypeEnum::QuantizedS16)) && + opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION && + (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && + param.filter_meta.spatial[0] == 3) && + (param.filter_meta.stride[0] == param.filter_meta.stride[1] && + param.filter_meta.stride[0] == 1) && + (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] && + param.filter_meta.dilation[0] == 1) && + param.compute_mode == param::ConvBias::ComputeMode::DEFAULT && + param.src_type.enumv() == DTypeEnum::QuantizedS8 && + param.bias_type.enumv() == DTypeEnum::QuantizedS32 && + param.dst_type.enumv() == DTypeEnum::QuantizedS8; + } + MIDOUT_END(); + return false; +} + +size_t ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::get_workspace( + fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { + MIDOUT_BEGIN( + megdnn_arm_common_conv_bias_int8, + midout_iv( + "arm_common_AlgoS8WinogradF23_8x8_NCHW44::get_workspace"_hash)) { + winograd::winograd_2x3_8x8_s8_nchw44 strategy( + param.src_type, param.filter_type, param.dst_type); + return megdnn::winograd::ConvBias( + strategy, m_tile_size, param.nr_threads, param.osz[0], + param.osz[1], param.filter_meta.ocpg) + .get_workspace_size(param, m_matmul_algo); + } + MIDOUT_END(); + return 0; +} + +SmallVector +ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::dispatch_kerns( + fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { + MIDOUT_BEGIN( + megdnn_arm_common_conv_bias_int8, + midout_iv( + "arm_common_AlgoS8WinogradF23_8x8_NCHW44::dispatch_kerns"_hash)) { + winograd::winograd_2x3_8x8_s8_nchw44 strategy( + param.src_type, param.filter_type, param.dst_type); + auto winograd_impl = + megdnn::winograd::ConvBias( + strategy, m_tile_size, param.nr_threads, param.osz[0], + param.osz[1], param.filter_meta.ocpg); + return winograd_impl.get_kerns(param, m_matmul_algo); + } + MIDOUT_END(); + return {}; +} // vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/algos.h b/dnn/src/arm_common/conv_bias/int8/algos.h index 5b2629dd5e8cce3c492a47a010267b38f21a5dad..fc5dcad5c27d5f31c2c5739fce8906453fa45678 100644 --- a/dnn/src/arm_common/conv_bias/int8/algos.h +++ b/dnn/src/arm_common/conv_bias/int8/algos.h @@ -220,6 +220,68 @@ private: uint32_t m_tile_size; }; +//=======================input int8 compute fp32 output int8============ +class ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44 final : public AlgoBase { +public: + AlgoS8CF32WinogradF23_4x4_NCHW44(fallback::MatrixMulImpl::AlgoBase* matmul_algo, + uint32_t tile_size) + : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} + bool is_reproducible() const override { return true; } + const char* name() const override { + if (m_name.empty()) { + m_name = ConvBiasImpl::algo_name( + m_matmul_algo->name(), {4, 2, m_tile_size}, + param::ConvBias::Format::NCHW44); + } + return m_name.c_str(); + } + bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, + AlgoSelectionStrategy algo_selection_strategy) const override; + size_t get_workspace(fallback::ConvBiasImpl*, + const NCBKernSizeParam& param) const override; + virtual SmallVector dispatch_kerns( + fallback::ConvBiasImpl* opr, + const NCBKernSizeParam& param) const override; + static std::vector + get_avaiable_matmul_algos(const NCBKernSizeParam& param); + +private: + fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; + mutable std::string m_name; + uint32_t m_tile_size; +}; + +//=======================input int8 compute int16 output int8============ +class ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44 final : public AlgoBase { +public: + AlgoS8WinogradF23_8x8_NCHW44(fallback::MatrixMulImpl::AlgoBase* matmul_algo, + uint32_t tile_size) + : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} + bool is_reproducible() const override { return true; } + const char* name() const override { + if (m_name.empty()) { + m_name = ConvBiasImpl::algo_name( + m_matmul_algo->name(), {8, 2, m_tile_size}, + param::ConvBias::Format::NCHW44); + } + return m_name.c_str(); + } + bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, + AlgoSelectionStrategy algo_selection_strategy) const override; + size_t get_workspace(fallback::ConvBiasImpl*, + const NCBKernSizeParam& param) const override; + virtual SmallVector dispatch_kerns( + fallback::ConvBiasImpl* opr, + const NCBKernSizeParam& param) const override; + static std::vector + get_avaiable_matmul_algos(const NCBKernSizeParam& param); + +private: + fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; + mutable std::string m_name; + uint32_t m_tile_size; +}; + } // namespace arm_common } // namespace megdnn diff --git a/dnn/src/arm_common/conv_bias/int8/strategy.h b/dnn/src/arm_common/conv_bias/int8/strategy.h index 717f99da6fba8844c472b74cf50d11d5ceed8ecb..f4bc4a334dac79606f2658b41f40b21cf0a25d97 100644 --- a/dnn/src/arm_common/conv_bias/int8/strategy.h +++ b/dnn/src/arm_common/conv_bias/int8/strategy.h @@ -20,6 +20,10 @@ namespace winograd { MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, int16_t, int, 2, 3, 8, 8, winograd_2x3_8x8_s8) +MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, int16_t, int, 2, 3, 8, 8, + winograd_2x3_8x8_s8_nchw44) +MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, float, float, 2, 3, 4, 4, + winograd_2x3_4x4_s8_f32_nchw44) } } // namespace arm_common } // namespace megdnn diff --git a/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_4x4.cpp b/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_4x4.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3177fd1dd3a4ef3b99450e09c462ab5ff00923f8 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_4x4.cpp @@ -0,0 +1,372 @@ +/** + * \file dnn/src/arm_common/conv_bias/fp32/strategy_nchw44_2x3_4x4.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/arm_common/conv_bias/int8/strategy.h" +#include "src/arm_common/simd_macro/marm_neon.h" +#include "src/arm_common/utils.h" +#include "src/common/unroll_macro.h" +#include "src/common/utils.h" +#include "src/fallback/conv_bias/winograd/winograd.h" + +#include "src/arm_common/conv_bias/winograd_common/winograd_common.h" +#include "src/naive/matrix_mul/matrix_mul_helper.h" +#include "src/arm_common/elemwise_helper/op_unary.h" +#include "src/arm_common/conv_bias/fp32/helper.h" + +#include "midout.h" + +MIDOUT_DECL(megdnn_arm_common_winograd_nchw44_s8_comp_fp32_f23) + +using namespace megdnn; +using namespace arm_common; +namespace { +struct InputTransform2X3 { + template + static void prepare(const int8_t* input, float* patch, float* patchT, + int ih_start, int iw_start, size_t IH, size_t IW, + size_t ic, size_t IC, size_t PH, size_t PW) { + megdnn_assert( + ic % 4 == 0 && IC % 4 == 0, + "Winograd input prepare param is not times of 4!"); + constexpr size_t alpha = 2 + 3 - 1; + MEGDNN_MARK_USED_VAR(patch); + if (inner) { + const int8_t* input_ptr = + input + ic * IH * IW + ih_start * IW * 4 + iw_start * 4; + for (size_t ico = 0; ico < 4; ++ico) { + int8x16_t v_input = vld1q_s8(input_ptr); + int16x8_t v_low = vmovl_s8(vget_low_s8(v_input)); + int16x8_t v_high = vmovl_s8(vget_high_s8(v_input)); + int32x4_t v_0 = vmovl_s16(vget_low_s16(v_low)); + int32x4_t v_1 = vmovl_s16(vget_high_s16(v_low)); + int32x4_t v_2 = vmovl_s16(vget_low_s16(v_high)); + int32x4_t v_3 = vmovl_s16(vget_high_s16(v_high)); + + vst1q_f32(patchT + ico * 4 * alpha + 0 * 4, + vcvtq_f32_s32(v_0)); + vst1q_f32(patchT + ico * 4 * alpha + 1 * 4, + vcvtq_f32_s32(v_1)); + vst1q_f32(patchT + ico * 4 * alpha + 2 * 4, + vcvtq_f32_s32(v_2)); + vst1q_f32(patchT + ico * 4 * alpha + 3 * 4, + vcvtq_f32_s32(v_3)); + input_ptr += IW * 4; + } + } else { + if (PH > 0 || PW > 0) { + memset(patchT, 0, sizeof(float) * 4 * alpha * alpha); + } + InputGetter getter; + const int8_t* input_ptr = input + ic * IH * IW; + int ih0_act = std::max(ih_start, 0), + ih1_act = std::min(ih_start + alpha, IH), + iw0_act = std::max(iw_start, 0), + iw1_act = std::min(iw_start + alpha, IW); + // partial copy + for (int ih = ih0_act; ih < ih1_act; ++ih) { + for (int iw = iw0_act; iw < iw1_act; ++iw) { + size_t iho = ih - ih_start, iwo = iw - iw_start; + vst1q_f32(patchT + iho * alpha * 4 + iwo * 4, + getter(input_ptr + ih * IW * 4 + iw * 4)); + } + } + } + } + + static void transform(const float* patchT, float* input_transform_buf, + size_t unit_idx, size_t nr_units_in_tile, size_t ic, + size_t IC) { + constexpr size_t alpha = 2 + 3 - 1; + // BT * d * B +#define cb(m, n) \ + Vector d##m##n = \ + Vector::load(patchT + m * 4 * 4 + n * 4); + + UNROLL_CALL_NOWRAPPER_D2(4, 4, cb); +#undef cb + + //! 1 0 -1 0 d00 d01 d02 d03 1 0 0 0 + //! 0 1 1 0 d10 d11 d12 d13 0 1 -1 -1 + //! 0 -1 1 0 d20 d21 d22 d23 -1 1 1 0 + //! 0 -1 0 1 d30 d31 d32 d33 0 0 0 1 +#define cb(m) \ + auto t0##m = d0##m - d2##m; \ + auto t1##m = d1##m + d2##m; \ + auto t2##m = d2##m - d1##m; \ + auto t3##m = d3##m - d1##m; + + UNROLL_CALL_NOWRAPPER(4, cb); +#undef cb + +#define cb(m) \ + d##m##0 = t##m##0 - t##m##2; \ + d##m##1 = t##m##1 + t##m##2; \ + d##m##2 = t##m##2 - t##m##1; \ + d##m##3 = t##m##3 - t##m##1; + + UNROLL_CALL_NOWRAPPER(4, cb); +#undef cb + + size_t ICB = IC / 4; + size_t icb = ic / 4; +#define cb(m, n) \ + d##m##n.save(input_transform_buf + \ + (m * alpha + n) * ICB * nr_units_in_tile * 4 + \ + icb * nr_units_in_tile * 4 + unit_idx * 4); + UNROLL_CALL_NOWRAPPER_D2(4, 4, cb) +#undef cb + } +}; + +template +struct OutputTransform2X3 { + static void transform(const float* output_transform_buf, const float* bias, + int8_t* output, float* transform_mid_buf, + size_t oh_start, size_t ow_start, size_t OH, + size_t OW, size_t oc_start, size_t oc_end, + size_t oc_index, size_t unit_idx, + size_t nr_units_in_tile, const DType& src_dtype, + const DType& filter_dtype, const DType& dst_dtype) { + float scale_filter = 0.f; + MEGDNN_MARK_USED_VAR(transform_mid_buf); + if (filter_dtype.enumv() == DTypeEnum::QuantizedS8) { + scale_filter = filter_dtype.param().scale; + } else if (filter_dtype.enumv() == DTypeEnum::QuantizedS32) { + megdnn_assert(filter_dtype.enumv() == DTypeEnum::QuantizedS32); + scale_filter = filter_dtype.param().scale; + } + float input_filter_scale = + src_dtype.param().scale * scale_filter; + DType buffer_dtype = dtype::QuantizedS32(input_filter_scale); + Op op(buffer_dtype, dst_dtype); + + //! AT * m * A + constexpr size_t alpha = 2 + 3 - 1; + + size_t oc = oc_start + oc_index; + size_t OCB = (oc_end - oc_start) / 4; + size_t ocb = oc_index / 4; + +#define cb(m, n) \ + auto v##m##n = Vector::load( \ + output_transform_buf + \ + (m * alpha + n) * OCB * nr_units_in_tile * 4 + \ + ocb * nr_units_in_tile * 4 + unit_idx * 4); + UNROLL_CALL_NOWRAPPER_D2(4, 4, cb); +#undef cb + //! 1 1 1 0 v00 v01 v02 v03 1 0 + //! 0 1 -1 1 v10 v11 v12 v13 1 1 + //! v20 v21 v22 v23 1 -1 + //! v30 v31 v32 v33 0 1 +#define cb(m) \ + auto t0##m = v0##m + v1##m + v2##m; \ + auto t1##m = v1##m - v2##m + v3##m; + + UNROLL_CALL_NOWRAPPER(4, cb); +#undef cb + + Vector result[2][2]; + + result[0][0] = t00 + t01 + t02; + result[1][0] = t10 + t11 + t12; + result[0][1] = t01 - t02 + t03; + result[1][1] = t11 - t12 + t13; + + const int32_t* tmp_bias = + static_cast(static_cast(bias)); + Vector vbias; + if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) { + const float32x4_t vvbias = vcvtq_f32_s32(vld1q_s32(tmp_bias + oc)); + vbias = Vector(vvbias); + + result[0][0] += vbias; + result[0][1] += vbias; + result[1][0] += vbias; + result[1][1] += vbias; + } + +#undef cb + +#if MEGDNN_AARCH64 + int32_t* tmp_ouput = static_cast(static_cast(output)); +#endif + for (size_t oho = 0; oho < 2 && oh_start + oho < OH; ++oho) { + for (size_t owo = 0; owo < 2 && ow_start + owo < OW; ++owo) { + size_t oh = oh_start + oho; + size_t ow = ow_start + owo; + + Vector res; + res = result[oho][owo]; + if (bmode == BiasMode::BIAS) { + const float32x4_t vvbias = vcvtq_f32_s32(vld1q_s32( + tmp_bias + oc * OH * OW + oh * OW * 4 + ow * 4)); + res += Vector(vvbias); + } +#if MEGDNN_AARCH64 + int8x8_t v_res = op(res.value); + tmp_ouput[oc * OH * OW / 4 + oh * OW + ow] = + vget_lane_s32(vreinterpret_s32_s8(v_res), 0); +#else + //! armv7 using neon there is some error ,so using scalar + //! compute + dt_qint8 res_int8 = dt_qint8(0); +#define cb(i) \ + res_int8 = op(dt_qint32(vgetq_lane_f32(res.value, i))); \ + output[oc * OH * OW + oh * OW * 4 + ow * 4 + i] = res_int8.as_int8(); + UNROLL_CALL_NOWRAPPER(4, cb); +#undef cb +#endif + } + } + } +}; +} // namespace + +namespace megdnn { +namespace arm_common { +namespace winograd { + +MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_2x3_4x4_s8_f32_nchw44) +void winograd_2x3_4x4_s8_f32_nchw44::filter(const int8_t* filter, + float* filter_transform_buf, + float* transform_mid_buf, size_t OC, size_t IC, + size_t oc_start, size_t oc_end) { + constexpr int alpha = 2 + 3 - 1; + + /** + * origin: (4x3) * (3 x 3) * (3 x 4) + */ + //! 1 0 0 v00 v01 v02 1 0.5 0.5 0 + //! 0.5 0.5 0.5 v10 v11 v12 0 0.5 -0.5 0 + //! 0.5 -0.5 0.5 v20 v21 v22 0 0.5 0.5 1 + //! 0 0 1 + + InputGetter getter; + MEGDNN_MARK_USED_VAR(transform_mid_buf); + megdnn_assert((oc_end - oc_start) % 4 == 0 && oc_start % 4 == 0 && + oc_end % 4 == 0 && IC % 4 == 0 && OC % 4 == 0, + "Winograd filter transform input param is not times of 4!"); + size_t OCB = OC / 4; + size_t ICB = IC / 4; + + for (size_t ocb = oc_start / 4; ocb < oc_end / 4; ocb++) { + for (size_t icb = 0; icb < ICB; icb++) { + for (size_t ic_inner = 0; ic_inner < 4; ic_inner++) { + const int8_t* fptr = filter + (ocb * ICB + icb) * 3 * 3 * 4 * 4 + + ic_inner * 4; + +#define cb(m, n) \ + Vector g##m##n = \ + Vector(getter(fptr + (m * 3 + n) * 4 * 4)); + + UNROLL_CALL_NOWRAPPER_D2(3, 3, cb) +#undef cb + +#define FILTER_TRANSFORM(n, wd, g) \ + auto wd##n##0 = g##0##n; \ + tmp0 = (g##0##n + g##2##n) * 0.5; \ + tmp1 = g##1##n * 0.5; \ + auto wd##n##1 = tmp0 + tmp1; \ + auto wd##n##2 = tmp0 - tmp1; \ + auto wd##n##3 = g##2##n; + Vector tmp0, tmp1; + UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g); + UNROLL_CALL_RAW(4, FILTER_TRANSFORM, ret, wd); +#undef FILTER_TRANSFORM + + +#define cb(m, n) \ + ret##m##n.save(filter_transform_buf + \ + (m * alpha + n) * OCB * ICB * 4 * 4 + ocb * ICB * 4 * 4 + \ + icb * 4 * 4 + ic_inner * 4); + UNROLL_CALL_NOWRAPPER_D2(4, 4, cb) +#undef cb + } + } + } +} + +void winograd_2x3_4x4_s8_f32_nchw44::input(const int8_t* input, float* input_transform_buf, + float* transform_mid_buf, size_t IH, size_t IW, + size_t IC, size_t PH, size_t PW, + size_t unit_start_idx, + size_t nr_units_in_tile) { + megdnn_assert(IC % 4 == 0); + constexpr int alpha = 3 + 2 - 1; + + auto units_w = + div_ceil(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE); + float* patch = transform_mid_buf; + float* patchT = transform_mid_buf + 4 * alpha * alpha; + + for (size_t ic = 0; ic < IC; ic += 4) { + rep(unit_idx, nr_units_in_tile) { + size_t index = unit_start_idx + unit_idx; + size_t nh = index / units_w; + size_t nw = index % units_w; + int ih_start = nh * OUTPUT_BLOCK_SIZE - PH; + int iw_start = nw * OUTPUT_BLOCK_SIZE - PW; + if (ih_start >= 0 && ih_start + alpha <= static_cast(IH) && + iw_start >= 0 && iw_start + alpha <= static_cast(IW)) { + InputTransform2X3::prepare(input, patch, patchT, ih_start, + iw_start, IH, IW, ic, IC,PH,PW); + InputTransform2X3::transform(patchT, input_transform_buf, + unit_idx, nr_units_in_tile, ic, + IC); + + } else { + InputTransform2X3::prepare(input, patch, patchT, + ih_start, iw_start, IH, IW, + ic, IC,PH,PW); + InputTransform2X3::transform(patchT, input_transform_buf, + unit_idx, nr_units_in_tile, ic, + IC); + } + } + } +} + +void winograd_2x3_4x4_s8_f32_nchw44::output(const float* output_transform_buf, + const float* bias, int8_t* output, + float* transform_mid_buf, BiasMode bmode, + NonlineMode nonline_mode, size_t OH, size_t OW, + size_t oc_start, size_t oc_end, + size_t unit_start_idx, + size_t nr_units_in_tile) { +#define cb(_bmode, _nonline_op, ...) \ + OutputTransform2X3<_bmode MEGDNN_COMMA _nonline_op>::transform(__VA_ARGS__); + + auto units_w = div_ceil(OW, OUTPUT_BLOCK_SIZE); + for (size_t oc = oc_start; oc < oc_end; oc += 4) { + size_t oc_index = oc - oc_start; + rep(unit_idx, nr_units_in_tile) { + size_t index = unit_start_idx + unit_idx; + auto nh = index / units_w; + auto nw = index % units_w; + size_t oh_start = nh * OUTPUT_BLOCK_SIZE; + size_t ow_start = nw * OUTPUT_BLOCK_SIZE; + DISPATCH_CONV_WINOGRAD_BIAS_QUANTIZED( + megdnn_arm_common_winograd_nchw44_s8_comp_fp32_f23, cb, + dt_qint32, dt_qint8, bmode, nonline_mode, + output_transform_buf, bias, output, transform_mid_buf, + oh_start, ow_start, OH, OW, oc_start, oc_end, oc_index, + unit_idx, nr_units_in_tile, src_dtype, filter_dtype, + dst_dtype); + } + } +#undef cb +} + +} // namespace winograd +} // namespace arm_common +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_8x8.cpp b/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_8x8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7ca25e1d04d476f54bfb0dc5e259f3d9bd2a5965 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_8x8.cpp @@ -0,0 +1,404 @@ +/** + * \file dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_8x8.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/fallback/conv_bias/winograd/winograd.h" +#include "src/naive/matrix_mul/matrix_mul_helper.h" + +#include "src/arm_common/conv_bias/winograd_common/winograd_common.h" +#include "src/arm_common/elemwise_helper/op_unary.h" +#include "src/arm_common/conv_bias/int8/strategy.h" +#include "src/arm_common/conv_bias/int8/helper.h" +#include "src/arm_common/simd_macro/marm_neon.h" +#include "src/arm_common/utils.h" + +#include "src/common/winograd/winograd_generator.h" +#include "src/common/unroll_macro.h" +#include "src/common/utils.h" + +#include "midout.h" + +MIDOUT_DECL(megdnn_arm_common_winograd_nchw44_s8_int16_8x8) + +using namespace megdnn; +using namespace arm_common; + +namespace { + +struct FilterTransform2X3_qs8 { + static void transform(const int8_t* filter_ptr, int16_t* filter_transform_buf, + int16_t* transform_mid_buf, size_t OC, size_t IC, + size_t oc_start, size_t oc_end) { + constexpr int alpha = 2 + 3 - 1; + + /** + * origin: (4x3) * (3 x 3) * (3 x 4) + */ + //! 1 0 0 v00 v01 v02 1 0.5 0.5 0 + //! 0.5 0.5 0.5 v10 v11 v12 0 0.5 -0.5 0 + //! 0.5 -0.5 0.5 v20 v21 v22 0 0.5 0.5 1 + //! 0 0 1 + + //! 2 0 0 v00 v01 v02 2 1 1 0 + //! 1 1 1 v10 v11 v12 0 1 -1 0 + //! 1 -1 1 v20 v21 v22 0 1 1 2 + //! 0 0 2 + //! G * g * GT + + InputGetter getter; + MEGDNN_MARK_USED_VAR(transform_mid_buf); + megdnn_assert( + (oc_end - oc_start) % 4 == 0 && oc_start % 4 == 0 && + oc_end % 4 == 0 && IC % 8 == 0 && OC % 8 == 0, + "Winograd filter transform input param is not times of 8!"); + size_t OCB = OC / 8; + size_t ICB = IC / 8; + size_t ICB4 = IC / 4; + for (size_t ocb = oc_start / 4; ocb < oc_end / 4; ocb++) { + size_t tmp_ocb = ocb / 2; + size_t index = ((ocb & 1) == 0) ? 0 : 1; + for (size_t icb = 0; icb < ICB4; icb++) { + for (size_t ic_inner = 0; ic_inner < 4; ic_inner++) { + const int8_t* fptr = filter_ptr + + (ocb * ICB4 + icb) * 3 * 3 * 4 * 4 + + ic_inner * 4; +#define cb(m, n) \ + Vector g##m##n = \ + Vector(getter(fptr + (m * 3 + n) * 4 * 4)); + + UNROLL_CALL_NOWRAPPER_D2(3, 3, cb) +#undef cb + +#define FILTER_TRANSFORM(n, wd, g) \ + auto wd##n##0 = g##0##n * 2; \ + v_tmp = g##0##n + g##2##n; \ + auto wd##n##1 = v_tmp + g##1##n; \ + auto wd##n##2 = v_tmp - g##1##n; \ + auto wd##n##3 = g##2##n * 2; + Vector v_tmp; + UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g); + UNROLL_CALL_RAW(4, FILTER_TRANSFORM, ret, wd); +#undef FILTER_TRANSFORM + +#define cb(m, n) \ + ret##m##n.save( \ + filter_transform_buf + (m * alpha + n) * OCB * ICB * 8 * 8 + \ + tmp_ocb * ICB * 8 * 8 + icb * 4 * 8 + ic_inner * 8 + index * 4); + + UNROLL_CALL_NOWRAPPER_D2(4, 4, cb) +#undef cb + } + } + } + } +}; + +struct InputTransform2X3_qs8 { + template + static void prepare(const int8_t* input, int16_t* patch, int16_t* patchT, + int ih_start, int iw_start, size_t IH, size_t IW, + size_t ic, size_t IC, size_t PH, size_t PW) { + megdnn_assert(ic % 8 == 0 && IC % 8 == 0, + "Winograd input prepare param is not times of 4!"); + MEGDNN_MARK_USED_VAR(patch); + constexpr size_t alpha = 2 + 3 - 1; + if (inner) { + const int8_t* input_ptr = + input + ic * IH * IW + ih_start * IW * 4 + iw_start * 4; + for (size_t ico = 0; ico < alpha; ++ico) { + int8x16_t v_input0 = vld1q_s8(input_ptr); // c0123 + int8x16_t v_input1 = + vld1q_s8(input_ptr + IH * IW * 4); // c4567 + int32x4_t v32_00 = vreinterpretq_s32_s8(v_input0); + int32x4_t v32_01 = vreinterpretq_s32_s8(v_input1); + + int32x4x2_t v_trn = vtrnq_s32(v32_00, v32_01); // c01234567 + + v_input0 = vreinterpretq_s8_s32(v_trn.val[0]); + v_input1 = vreinterpretq_s8_s32(v_trn.val[1]); + + int16x8_t v0_low = vmovl_s8(vget_low_s8(v_input0)); + int16x8_t v0_high = vmovl_s8(vget_high_s8(v_input0)); + int16x8_t v1_low = vmovl_s8(vget_low_s8(v_input1)); + int16x8_t v1_high = vmovl_s8(vget_high_s8(v_input1)); + + vst1q_s16(patchT + ico * 8 * alpha + 0 * 8, v0_low); + vst1q_s16(patchT + ico * 8 * alpha + 1 * 8, v1_low); + vst1q_s16(patchT + ico * 8 * alpha + 2 * 8, v0_high); + vst1q_s16(patchT + ico * 8 * alpha + 3 * 8, v1_high); + input_ptr += IW * 4; // next row + } + } else { + if (PH || PW) { + memset(patchT, 0, sizeof(int16_t) * 8 * alpha * alpha); + } + InputGetter getter; + const int8_t* input_ptr = input + ic * IH * IW; + int ih0_act = std::max(ih_start, 0), + ih1_act = std::min(ih_start + alpha, IH), + iw0_act = std::max(iw_start, 0), + iw1_act = std::min(iw_start + alpha, IW); + // partial copy + for (int ih = ih0_act; ih < ih1_act; ++ih) { + for (int iw = iw0_act; iw < iw1_act; ++iw) { + size_t iho = ih - ih_start, iwo = iw - iw_start; + vst1q_s16(patchT + iho * alpha * 8 + iwo * 8, + vcombine_s16( + getter(input_ptr + ih * IW * 4 + iw * 4), + getter(input_ptr + IH * IW * 4 + + ih * IW * 4 + iw * 4))); + } + } + } + } + + static void transform(const int16_t* patchT, int16_t* input_transform_buf, + size_t unit_idx, size_t nr_units_in_tile, size_t ic, + size_t IC) { + constexpr size_t alpha = 2 + 3 - 1; + // BT * d * B +#define cb(m, n) \ + Vector d##m##n = \ + Vector::load(patchT + m * 4 * 8 + n * 8); + + UNROLL_CALL_NOWRAPPER_D2(4, 4, cb); +#undef cb + + //! 1 0 -1 0 d00 d01 d02 d03 1 0 0 0 + //! 0 1 1 0 d10 d11 d12 d13 0 1 -1 -1 + //! 0 -1 1 0 d20 d21 d22 d23 -1 1 1 0 + //! 0 -1 0 1 d30 d31 d32 d33 0 0 0 1 +#define cb(m) \ + auto t0##m = d0##m - d2##m; \ + auto t1##m = d1##m + d2##m; \ + auto t2##m = d2##m - d1##m; \ + auto t3##m = d3##m - d1##m; + + UNROLL_CALL_NOWRAPPER(4, cb); +#undef cb + +#define cb(m) \ + d##m##0 = t##m##0 - t##m##2; \ + d##m##1 = t##m##1 + t##m##2; \ + d##m##2 = t##m##2 - t##m##1; \ + d##m##3 = t##m##3 - t##m##1; + + UNROLL_CALL_NOWRAPPER(4, cb); +#undef cb + + size_t ICB = IC / 8; + size_t icb = ic / 8; +#define cb(m, n) \ + d##m##n.save(input_transform_buf + \ + (m * alpha + n) * ICB * nr_units_in_tile * 8 + \ + icb * nr_units_in_tile * 8 + unit_idx * 8); + UNROLL_CALL_NOWRAPPER_D2(4, 4, cb) +#undef cb + } +}; + +template +struct OutputTransform2X3_qs8 { + static void transform(const int32_t* output_transform_buf, + const int32_t* bias, int8_t* output, + int32_t* transform_mid_buf, size_t oh_start, + size_t ow_start, size_t OH, size_t OW, + size_t oc_start, size_t oc_end, size_t oc_index, + size_t unit_idx, size_t nr_units_in_tile, + const DType& src_dtype, const DType& filter_dtype, + const DType& dst_dtype) { + MEGDNN_MARK_USED_VAR(transform_mid_buf); + float scale_filter = 0.f; + if (filter_dtype.enumv() == DTypeEnum::QuantizedS8) { + scale_filter = filter_dtype.param().scale; + } else { + megdnn_assert(filter_dtype.enumv() == DTypeEnum::QuantizedS16); + scale_filter = filter_dtype.param().scale; + } + float input_filter_scale = + src_dtype.param().scale * scale_filter; + DType buffer_dtype = dtype::QuantizedS32(input_filter_scale * 0.5f * + 0.5f * 1.0f * 1.0f); + Op op(buffer_dtype, dst_dtype); + //! AT * m * A + constexpr size_t alpha = 2 + 3 - 1; + + size_t oc = oc_start + oc_index; + size_t OCB = (oc_end - oc_start) / 8; + size_t ocb = oc_index / 8; + +#define cb(m, n) \ + auto v##m##n = Vector::load( \ + output_transform_buf + \ + (m * alpha + n) * OCB * nr_units_in_tile * 8 + \ + ocb * nr_units_in_tile * 8 + unit_idx * 8); + UNROLL_CALL_NOWRAPPER_D2(4, 4, cb); +#undef cb + //! 1 1 1 0 v00 v01 v02 v03 1 0 + //! 0 1 -1 1 v10 v11 v12 v13 1 1 + //! v20 v21 v22 v23 1 -1 + //! v30 v31 v32 v33 0 1 +#define cb(m) \ + auto t0##m = v0##m + v1##m + v2##m; \ + auto t1##m = v1##m - v2##m + v3##m; + + UNROLL_CALL_NOWRAPPER(4, cb); +#undef cb + + + Vector result[2][2]; + + result[0][0] = t00 + t01 + t02; + result[1][0] = t10 + t11 + t12; + result[0][1] = t01 - t02 + t03; + result[1][1] = t11 - t12 + t13; + + if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) { + Vector vbias; + vbias = Vector::load(bias + oc) * 4; + + result[0][0] += vbias; + result[0][1] += vbias; + result[1][0] += vbias; + result[1][1] += vbias; + } + +#if MEGDNN_AARCH64 + int32_t* tmp_output = static_cast(static_cast(output)); +#endif + for (size_t oho = 0; oho < 2 && oh_start + oho < OH; ++oho) { + for (size_t owo = 0; owo < 2 && ow_start + owo < OW; ++owo) { + size_t oh = oh_start + oho; + size_t ow = ow_start + owo; + Vector res = result[oho][owo]; + if (bmode == BiasMode::BIAS) { + int32x4x2_t vbias; + vbias.val[0] = vld1q_s32(bias + oc * OH * OW + oh * OW * 4 + + ow * 4); + vbias.val[1] = vld1q_s32(bias + (oc + 4) * OH * OW + + oh * OW * 4 + ow * 4); + res += Vector(vbias) * 4; + } +#if MEGDNN_AARCH64 + int8x8_t res_int8 = op(res.value); + int32x2_t res32 = vreinterpret_s32_s8(res_int8); + tmp_output[oc / 4 * OH * OW + oh * OW + ow] = + vget_lane_s32(res32, 0); + tmp_output[(oc / 4 + 1) * OH * OW + oh * OW + ow] = + vget_lane_s32(res32, 1); +#else + + dt_qint8 res_int8 = dt_qint8(0); +#define cb(i) \ + res_int8 = op(dt_qint32(vgetq_lane_s32(res.value.val[0], i))); \ + output[oc * OH * OW + oh * OW * 4 + ow * 4 + i] = res_int8.as_int8(); \ + res_int8 = op(dt_qint32(vgetq_lane_s32(res.value.val[1], i))); \ + output[(oc + 4) * OH * OW + oh * OW * 4 + ow * 4 + i] = res_int8.as_int8(); + UNROLL_CALL_NOWRAPPER(4, cb); +#undef cb +#endif + } + } +#undef cb + } +}; +} // namespace + +namespace megdnn { +namespace arm_common { +namespace winograd { + +MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_2x3_8x8_s8_nchw44) + +void winograd_2x3_8x8_s8_nchw44::filter(const int8_t* filter, + int16_t* filter_transform_buf, + int16_t* transform_mid_buf, size_t OC, + size_t IC, size_t oc_start, size_t oc_end) { + FilterTransform2X3_qs8::transform(filter, filter_transform_buf, + transform_mid_buf, OC, IC, oc_start, + oc_end); +} + +void winograd_2x3_8x8_s8_nchw44::input(const int8_t* input, + int16_t* input_transform_buf, + int16_t* transform_mid_buf, size_t IH, + size_t IW, size_t IC, size_t PH, size_t PW, + size_t unit_start_idx, + size_t nr_units_in_tile) { + megdnn_assert(IC % 8 == 0); + constexpr int alpha = 3 + 2 - 1; + auto units_w = div_ceil(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE); + int16_t* patch = transform_mid_buf; + int16_t* patchT = transform_mid_buf;// + 8 * alpha * alpha; + + for (size_t ic = 0; ic < IC; ic += 8) { + rep(unit_idx, nr_units_in_tile) { + size_t index = unit_start_idx + unit_idx; + size_t nh = index / units_w; + size_t nw = index % units_w; + int ih_start = nh * OUTPUT_BLOCK_SIZE - PH; + int iw_start = nw * OUTPUT_BLOCK_SIZE - PW; + if (ih_start >= 0 && ih_start + alpha <= static_cast(IH) && + iw_start >= 0 && iw_start + alpha <= static_cast(IW)) { + InputTransform2X3_qs8::prepare(input, patch, patchT, + ih_start, iw_start, IH, IW, + ic, IC,PH,PW); + InputTransform2X3_qs8::transform(patchT, input_transform_buf, + unit_idx, nr_units_in_tile, ic, + IC); + + } else { + InputTransform2X3_qs8::prepare(input, patch, patchT, + ih_start, iw_start, IH, + IW, ic, IC,PH,PW); + InputTransform2X3_qs8::transform(patchT, input_transform_buf, + unit_idx, nr_units_in_tile, ic, + IC); + } + } + } +} + +void winograd_2x3_8x8_s8_nchw44::output(const int* output_transform_buf, + const int* bias, int8_t* output, + int* transform_mid_buf, BiasMode bmode, + NonlineMode nonline_mode, size_t OH, size_t OW, + size_t oc_start, size_t oc_end, + size_t unit_start_idx, + size_t nr_units_in_tile) { +#define cb(_bmode, _nonline_op, ...) \ + OutputTransform2X3_qs8<_bmode MEGDNN_COMMA _nonline_op>::transform( \ + __VA_ARGS__); + auto units_w = div_ceil(OW, OUTPUT_BLOCK_SIZE); + for (size_t oc = oc_start; oc < oc_end; oc += 8) { + size_t oc_index = oc - oc_start; + rep(unit_idx, nr_units_in_tile) { + size_t index = unit_start_idx + unit_idx; + auto nh = index / units_w; + auto nw = index % units_w; + size_t oh_start = nh * OUTPUT_BLOCK_SIZE; + size_t ow_start = nw * OUTPUT_BLOCK_SIZE; + DISPATCH_CONV_WINOGRAD_BIAS_QUANTIZED( + megdnn_arm_common_winograd_nchw44_s8_int16_8x8, cb, + dt_qint32, dt_qint8, bmode, nonline_mode, + output_transform_buf, bias, output, transform_mid_buf, + oh_start, ow_start, OH, OW, oc_start, oc_end, oc_index, + unit_idx, nr_units_in_tile, src_dtype, filter_dtype, + dst_dtype); + } + } +#undef cb +} + +} // namespace winograd +} // namespace arm_common +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/opr_impl.cpp b/dnn/src/arm_common/conv_bias/opr_impl.cpp index a1bab1b2d954aa0ed242a4a875ce4c788248becc..f8fd4f204f4f85f5baad3468e7bef2acab51fcaf 100644 --- a/dnn/src/arm_common/conv_bias/opr_impl.cpp +++ b/dnn/src/arm_common/conv_bias/opr_impl.cpp @@ -201,6 +201,14 @@ public: static_cast(algo), tile_size)); winograd_algos.emplace_back(refhold.back().get()); + refhold.emplace_back(new AlgoS8CF32WinogradF23_4x4_NCHW44( + static_cast(algo), + tile_size)); + winograd_algos.emplace_back(refhold.back().get()); + refhold.emplace_back(new AlgoS8WinogradF23_8x8_NCHW44( + static_cast(algo), + tile_size)); + winograd_algos.emplace_back(refhold.back().get()); } } } diff --git a/dnn/src/arm_common/conv_bias/opr_impl.h b/dnn/src/arm_common/conv_bias/opr_impl.h index e1451126ac8879ffd3e19e8118b594b0d8c2317f..1e92a0fb0605d29d3c168c6c19abee6cd4f08f52 100644 --- a/dnn/src/arm_common/conv_bias/opr_impl.h +++ b/dnn/src/arm_common/conv_bias/opr_impl.h @@ -79,6 +79,8 @@ private: class AlgoI8x8x16Stride2; class AlgoI8x8x16Stride2Filter2; class AlgoS8WinogradF23_8x8; + class AlgoS8CF32WinogradF23_4x4_NCHW44; + class AlgoS8WinogradF23_8x8_NCHW44; #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC class AlgoF16Direct; class AlgoF16DirectStride1; diff --git a/dnn/src/arm_common/conv_bias/winograd_common/winograd_common.h b/dnn/src/arm_common/conv_bias/winograd_common/winograd_common.h index d1e43466c4772d189ccc92d9f22a5365130fe226..345935a2d4712355108d72015f912e1d1d0601dc 100644 --- a/dnn/src/arm_common/conv_bias/winograd_common/winograd_common.h +++ b/dnn/src/arm_common/conv_bias/winograd_common/winograd_common.h @@ -37,6 +37,13 @@ struct InputGetter { return vget_low_u16(vmovl_u8(vld1_u8(ptr))) - zp; } }; + +template <> +struct InputGetter { + float32x4_t operator()(const int8_t* ptr) { + return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vld1_s8(ptr))))); + } +}; } // namespace } // namespace arm_common } // namespace megdnn diff --git a/dnn/src/arm_common/elemwise_helper/kimpl/relu.h b/dnn/src/arm_common/elemwise_helper/kimpl/relu.h index 76949ddbe44e12d50778a2659de29856bbe39f3f..cd52b5ba77e65cd9c70532e8ddd8b84f0b52d9e8 100644 --- a/dnn/src/arm_common/elemwise_helper/kimpl/relu.h +++ b/dnn/src/arm_common/elemwise_helper/kimpl/relu.h @@ -189,6 +189,11 @@ struct ReluOp : ReluOpBase { vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero()); return QConverter::convert(vitem0); } + int8x8_t operator()(const float32x4_t& src) const { + auto vitem0 = vmulq_f32(src, this->vscale); + vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero()); + return QConverter::convert(vitem0); + } }; #else template <> @@ -215,12 +220,25 @@ struct ReluOp : ReluOpBase, return vqmovn_s16(vcombine_s16(vqmovn_s32(vrshlq_s32(vitem0, vshift)), vqmovn_s32(vrshlq_s32(vitem1, vshift)))); } + int8x8_t operator()(const float32x4_t& vsrc) const { + int32x4_t vitem0 = vqrdmulhq_s32(vcvtq_s32_f32(vsrc), vmultiplier); + vitem0 = vmaxq_s32(vitem0, QConverterBase::vzero()); + vitem0 = vrshlq_s32(vitem0, vshift); + int16x4_t vitem = vqmovn_s32(vitem0); + return vqmovn_s16(vcombine_s16(vitem, vitem)); + } void operator()(const int32x4_t& src, dt_qint8* dst) const { auto vitem0 = vmulq_f32(vcvtq_f32_s32(src), this->vscale); vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero()); auto result = QConverter::convert(vitem0); vst1_lane_s32(reinterpret_cast(dst), (int32x2_t)result, 0); } + void operator()(const float32x4_t& src, dt_qint8* dst) const { + auto vitem0 = vmulq_f32(src, this->vscale); + vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero()); + auto result = QConverter::convert(vitem0); + vst1_lane_s32(reinterpret_cast(dst), (int32x2_t)result, 0); + } }; #endif diff --git a/dnn/src/arm_common/elemwise_helper/kimpl/typecvt.h b/dnn/src/arm_common/elemwise_helper/kimpl/typecvt.h index 3104b0eef29fece29383fd5aab1138a886e9a35c..c52a8c5db777bb1b77c1cd388e745dd777462712 100644 --- a/dnn/src/arm_common/elemwise_helper/kimpl/typecvt.h +++ b/dnn/src/arm_common/elemwise_helper/kimpl/typecvt.h @@ -50,6 +50,10 @@ struct TypeCvtOp : UnaryOpBase { auto vitem0 = vmulq_f32(vcvtq_f32_s32(src), this->vscale); return QConverter::convert(vitem0); } + int8x8_t operator()(const float32x4_t& src) const { + auto vitem0 = vmulq_f32(src, this->vscale); + return QConverter::convert(vitem0); + } }; #else template <> @@ -95,6 +99,13 @@ struct TypeCvtOp : UnaryOpBase, int16x4_t vres0_int16 = vqmovn_s32(vrshlq_s32(vitem0, vshift)); return vqmovn_s16(vcombine_s16(vres0_int16, vres0_int16)); } + int8x8_t operator()(const float32x4_t& src) const { + int32x4_t vitem0 = vqrdmulhq_s32(vcvtq_s32_f32(src), vmultiplier); + auto fixup0 = vshrq_n_s32(vitem0, 31); + vitem0 = vqaddq_s32(vitem0, fixup0); + int16x4_t vres0_int16 = vqmovn_s32(vrshlq_s32(vitem0, vshift)); + return vqmovn_s16(vcombine_s16(vres0_int16, vres0_int16)); + } }; #endif diff --git a/dnn/src/arm_common/utils.h b/dnn/src/arm_common/utils.h index 3b790fbe65b1414be3db05ee0e99abee8e7b96d9..d5d368fccbbe42d89a7858b3f2d9af6612b323a1 100644 --- a/dnn/src/arm_common/utils.h +++ b/dnn/src/arm_common/utils.h @@ -369,6 +369,72 @@ struct Vector { } }; +template <> +struct Vector { + int16x4_t value; + Vector() {} + Vector(const int16_t v) { value = vdup_n_s16(v); } + Vector(const Vector& lr) { value = lr.value; } + Vector(const Vector&& lr) { value = std::move(lr.value); } + Vector(const int16x4_t& v) { value = v; } + static Vector load(const int16_t* addr) { + Vector v; + v.value = vld1_s16(addr); + return v; + } + static void save(int16_t* addr, const Vector& v) { + vst1_s16(addr, v.value); + } + void save(int16_t* addr) { save(addr, *this); } + Vector operator+(const Vector& lr) { + Vector dst; + dst.value = vadd_s16(value, lr.value); + return dst; + } + Vector& operator+=(const Vector& lr) { + value = vadd_s16(value, lr.value); + return *this; + } + Vector operator-(const Vector& lr) { + Vector dst; + dst.value = vsub_s16(value, lr.value); + return dst; + } + Vector& operator-=(const Vector& lr) { + value = vsub_s16(value, lr.value); + return *this; + } + Vector operator*(int16_t lr) { + Vector dst; + dst.value = vmul_n_s16(value, lr); + return dst; + } + Vector operator*(const Vector& lr) { + Vector dst; + dst.value = vmul_s16(value, lr.value); + return dst; + } + Vector& operator*=(const Vector& lr) { + value = vmul_s16(value, lr.value); + return *this; + } + Vector& operator=(const Vector& lr) { + value = lr.value; + return *this; + } + Vector& operator=(const Vector&& lr) { + value = std::move(lr.value); + return *this; + } + Vector operator-() { + Vector dst; + dst.value = -value; + return dst; + } +}; + + + template <> struct Vector { int32x4x2_t value; diff --git a/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp b/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp index 3842a65013bcb8c4af79fe51e25bb349a192478e..08f6d3d9bdd9afcb4bb7558d69a968fbbdd7dbaa 100644 --- a/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp +++ b/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp @@ -109,12 +109,36 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, } if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { const dt_int8* src_ptr = src.compatible_ptr(); - dt_int16* dst_ptr = dst.compatible_ptr(); - dt_int16* workspace_ptr = workspace.ptr(); - if (FW == 3) { - if (m == 2) { - DISPATCH(winograd_2x3_8x8_s8, param::Winograd::Format::MK8, 1, - 0); + if (param().compute_mode == param::ConvBias::ComputeMode::DEFAULT) { + dt_int16* dst_ptr = dst.compatible_ptr(); + dt_int16* workspace_ptr = workspace.ptr(); + if (FW == 3) { + if (m == 2) { + if (pack_c_size == 1) { + DISPATCH(winograd_2x3_8x8_s8, + param::Winograd::Format::MK8, 1, 0); + } else if (pack_c_size == 4) { + DISPATCH(winograd_2x3_8x8_s8_nchw44, + param::Winograd::Format::MK8, 1, 0); + }else{ + megdnn_throw("only support pack_c_size = 1 or 4"); + } + } + } + } else { + dt_int32* dst_ptr_tmp = dst.compatible_ptr(); + dt_int32* workspace_ptr_tmp = workspace.ptr(); + float* dst_ptr = reinterpret_cast(dst_ptr_tmp); + float* workspace_ptr = reinterpret_cast(workspace_ptr_tmp); + if (pack_c_size == 4) { + if (FW == 3) { + if (m == 2) { + DISPATCH(winograd_2x3_4x4_s8_f32_nchw44, + param::Winograd::Format::MK4, 1, 1); + } + } + } else { + megdnn_throw("only support pack_c_size == 4"); } } } diff --git a/dnn/src/common/conv_bias.cpp b/dnn/src/common/conv_bias.cpp index af5d6caa5f9d1dfeb9783010183024ddfeccf42e..c486a119320f45a4c1ec255f05e6e76b01d0cbd4 100644 --- a/dnn/src/common/conv_bias.cpp +++ b/dnn/src/common/conv_bias.cpp @@ -37,7 +37,9 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( param().format == param::ConvBias::Format::NCHW88_WINOGRAD || param().format == param::ConvBias::Format::NCHW44_WINOGRAD) && src.dtype.category() == DTypeCategory::QUANTIZED) { - megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16); + megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16 || + //!int8 winogradf23_44 using float,QuantizedS32 take the scale + filter.dtype.enumv() == DTypeEnum::QuantizedS32); megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || src.dtype.enumv() == DTypeEnum::Quantized8Asymm); } else { @@ -49,7 +51,12 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( if (param().format == param::ConvBias::Format::NCHW_WINOGRAD || param().format == param::ConvBias::Format::NCHW88_WINOGRAD || param().format == param::ConvBias::Format::NCHW44_WINOGRAD) { - scale_filter = filter.dtype.param().scale; + if (filter.dtype.enumv() == DTypeEnum::QuantizedS32) { + //!int8 winogradf23_44 using float,QuantizedS32 take the scale + scale_filter = filter.dtype.param().scale; + } else { + scale_filter = filter.dtype.param().scale; + } } else { scale_filter = filter.dtype.param().scale; } diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp index e346d4e6449e94cab792b0304472a631232c5509..914d23515a6e90bd7098aad7532483b5a82572ac 100644 --- a/dnn/src/common/convolution.cpp +++ b/dnn/src/common/convolution.cpp @@ -312,11 +312,14 @@ void make_canonized_filter_meta_nchwxx( size_t img_ndim = 2; size_t flt_start = 0; size_t flt_spatial_start = 2; + size_t pack_c_size = 0; if (param.sparse == Param::Sparse::DENSE) { if (filter.ndim == img_ndim + 4) { // oihw8i8o case - megdnn_assert(filter[filter.ndim - 2] == pack_size && - filter[filter.ndim - 1] == pack_size, + megdnn_assert((filter[filter.ndim - 2] == pack_size && + filter[filter.ndim - 1] == pack_size) || + (filter[filter.ndim - 2] == 2 * pack_size && + filter[filter.ndim - 1] == 2 * pack_size), "last 2 dim of filter must be %zu, but got %zu, %zu", pack_size, filter[filter.ndim - 2], filter[filter.ndim - 1]); @@ -326,8 +329,14 @@ void make_canonized_filter_meta_nchwxx( param.format == Param::Format::NCHW44_WINOGRAD) { flt_start = 2; } - ret.ocpg = filter[flt_start] * pack_size; - ret.icpg = filter[flt_start + 1] * pack_size; + if (filter[filter.ndim - 2] == 2 * pack_size && + filter[filter.ndim - 1] == 2 * pack_size) { + pack_c_size = 2 * pack_size; + } else { + pack_c_size = pack_size; + } + ret.ocpg = filter[flt_start] * pack_c_size; + ret.icpg = filter[flt_start + 1] * pack_c_size; } else if (filter.ndim == img_ndim + 3) { // ohwi8o megdnn_assert(param.format != Param::Format::NCHW88_WINOGRAD, @@ -375,15 +384,23 @@ void make_canonized_filter_meta_nchwxx( "bad filter ndim for group convolution: " "spatial_ndim=%zu filter_ndim=%zu", img_ndim, filter.ndim); - megdnn_assert(filter[filter.ndim - 1] == pack_size && - filter[filter.ndim - 2] == pack_size, + megdnn_assert((filter[filter.ndim - 1] == pack_size && + filter[filter.ndim - 2] == pack_size) || + (filter[filter.ndim - 1] == 2 * pack_size && + filter[filter.ndim - 2] == 2 * pack_size), "last 2 dim of filter must be %zu, but got %zu, %zu", pack_size, filter[filter.ndim - 2], filter[filter.ndim - 1]); ret.group = filter[0]; - ret.ocpg = filter_oc * pack_size; - ret.icpg = filter_ic * pack_size; + if (filter[filter.ndim - 2] == 2 * pack_size && + filter[filter.ndim - 1] == 2 * pack_size) { + ret.ocpg = filter_oc * 2 * pack_size; + ret.icpg = filter_ic * 2 * pack_size; + } else { + ret.ocpg = filter_oc * pack_size; + ret.icpg = filter_ic * pack_size; + } } } ret.spatial_ndim = 2; @@ -596,8 +613,17 @@ void ConvolutionBase::check_or_deduce_dtype_fwd(DType src, } else if (src.enumv() == DTypeEnum::QuantizedS8 || src.enumv() == DTypeEnum::Quantized8Asymm || src.enumv() == DTypeEnum::Quantized4Asymm) { - supported_dst_dtype.push_back( - dtype::QuantizedS32(mul_scale(src, filter))); + //! Qint8 winograd compute with float, in order to bringing the filter + //! scale, here just use QuantizedS32 as filter type. + if (src.enumv() == DTypeEnum::QuantizedS8 && + filter.enumv() == DTypeEnum::QuantizedS32) { + supported_dst_dtype.push_back(dtype::QuantizedS32( + src.param().scale * + filter.param().scale)); + } else { + supported_dst_dtype.push_back( + dtype::QuantizedS32(mul_scale(src, filter))); + } if (dst.valid() && dst.enumv() == src.enumv()) { supported_dst_dtype.push_back(dst); } @@ -625,12 +651,13 @@ void ConvolutionBase::check_or_deduce_dtype_fwd(DType src, megdnn_assert(dst_supported, "unsupported Conv(%s, %s) -> %s", src.name(), filter.name(), dst.name()); } - megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 + megdnn_assert((param().compute_mode == Param::ComputeMode::FLOAT32 || + param().compute_mode == Param::ComputeMode::DEFAULT) #if !MEGDNN_DISABLE_FLOAT16 - || src.enumv() == DTypeEnum::Float16 - || src.enumv() == DTypeEnum::BFloat16 + || src.enumv() == DTypeEnum::Float16 || + src.enumv() == DTypeEnum::BFloat16 #endif - , + , "ComputeMode::FLOAT32 is only available for Float16/BFloat16 " "input / output."); } @@ -645,10 +672,12 @@ ConvolutionBase::deduce_layout_fwd(const TensorLayout& src, megdnn_assert_contiguous(src); megdnn_assert_contiguous(filter); megdnn_assert(src.ndim >= 3_z, "%s", errmsg().c_str()); - if (param().format == Param::Format::NCHW_WINOGRAD && + if ((param().format == Param::Format::NCHW_WINOGRAD || + param().format == Param::Format::NCHW44_WINOGRAD) && src.dtype.category() == DTypeCategory::QUANTIZED) { - megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16, "%s", - errmsg().c_str()); + megdnn_assert((filter.dtype.enumv() == DTypeEnum::QuantizedS16 || + filter.dtype.enumv() == DTypeEnum::QuantizedS32), + "%s", errmsg().c_str()); megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || src.dtype.enumv() == DTypeEnum::Quantized8Asymm, "%s", errmsg().c_str()); @@ -741,14 +770,18 @@ ConvolutionBase::deduce_layout_fwd(const TensorLayout& src, if (param().format == Param::Format::NCHW44 || param().format == Param::Format::NCHW44_DOT || param().format == Param::Format::NCHW44_WINOGRAD) { + //!support nchw44 filter change to 88 for int8 winogradf23_88 using MK8 mamtul megdnn_assert((src.ndim == 4 && filter.ndim == 5 && filter[filter.ndim - 1] == 4) || (src.ndim == 5 && ((filter.ndim == 6 && - filter[filter.ndim - 1] == 4) || + (filter[filter.ndim - 1] == 4 || + filter[filter.ndim - 1] == 8)) || (filter.ndim == 7 && - filter[filter.ndim - 1] == 4 && - filter[filter.ndim - 2] == 4)) && + (filter[filter.ndim - 1] == 4 || + filter[filter.ndim - 1] == 8) && + (filter[filter.ndim - 2] == 4 || + filter[filter.ndim - 2] == 8))) && src[src.ndim - 1] == 4), "NCHW44 require src ndim is 5 and filter's ndim is 6 " ", and last shape two is 4 but got src %s, filter %s", diff --git a/dnn/src/common/winograd/winograd_helper.cpp b/dnn/src/common/winograd/winograd_helper.cpp index 10ba9af1541a625b3dcf01b98d181861a1d949cd..0e3cfa5b3e26cc421d5ed58e3b7ffa1f090fbf02 100644 --- a/dnn/src/common/winograd/winograd_helper.cpp +++ b/dnn/src/common/winograd/winograd_helper.cpp @@ -67,8 +67,8 @@ constexpr size_t layout_pack_size(param::ConvBias::Format layout) { switch (layout) { case param::ConvBias::Format::NHWCD4: return 4; - case param::ConvBias::Format::NCHW4: case param::ConvBias::Format::NCHW44: + case param::ConvBias::Format::NCHW4: return 4; case param::ConvBias::Format::NCHW32: return 32; @@ -365,6 +365,7 @@ INST(uint8_t, uint8_t, int16_t, int) _output_compute_type, layout, param::MatrixMul::Format::MK4>; INST(float, float, float, float, param::ConvBias::Format::NCHW) INST(float, float, float, float, param::ConvBias::Format::NCHW44) +INST(int8_t, int8_t, float, float, param::ConvBias::Format::NCHW44) #undef INST #define INST(_ctype, _dst_type, _input_filter_compute_type, \ @@ -373,6 +374,7 @@ INST(float, float, float, float, param::ConvBias::Format::NCHW44) _ctype, _dst_type, _input_filter_compute_type, \ _output_compute_type, layout, param::MatrixMul::Format::MK8>; INST(int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW) +INST(int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW44) INST(float, float, float, float, param::ConvBias::Format::NCHW88) MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16, param::ConvBias::Format::NCHW)) diff --git a/dnn/src/common/winograd_filter_preprocess.cpp b/dnn/src/common/winograd_filter_preprocess.cpp index c3471ed99f144c00f9623c1313ff5abf123e3ae3..486aadb6694617d7f2a84f72955a2e14449a3cac 100644 --- a/dnn/src/common/winograd_filter_preprocess.cpp +++ b/dnn/src/common/winograd_filter_preprocess.cpp @@ -56,8 +56,16 @@ void WinogradFilterPreprocess::deduce_layout(const TensorLayout& src, DType dst_type = src.dtype; if (src.dtype.category() == DTypeCategory::QUANTIZED) { megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8); - dst_type = dtype::QuantizedS16( - src.dtype.param().scale); + if (param().compute_mode == + param::ConvBias::ComputeMode::DEFAULT) { + //! input int8 compute short + dst_type = dtype::QuantizedS16( + src.dtype.param().scale); + } else { + //! input int8 compute float32 + dst_type = dtype::QuantizedS32( + src.dtype.param().scale); + } } if (src.ndim == 4 || src.ndim == 6) { @@ -123,8 +131,16 @@ size_t WinogradFilterPreprocess::get_workspace_in_bytes( if (src.dtype.category() == DTypeCategory::QUANTIZED) { megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || src.dtype.enumv() == DTypeEnum::Quantized8Asymm); - output_compute_dtype = dtype::QuantizedS16( - src.dtype.param().scale); + if (param().compute_mode == + param::ConvBias::ComputeMode::DEFAULT) { + //! input int8 compute short + output_compute_dtype = dtype::QuantizedS16( + src.dtype.param().scale); + } else { + //! input int8 compute float32 + output_compute_dtype = dtype::QuantizedS32( + src.dtype.param().scale); + } } size_t FW = src[3]; diff --git a/dnn/src/naive/conv_bias/opr_impl.cpp b/dnn/src/naive/conv_bias/opr_impl.cpp index ffa2190d37e0f7e7b3fdd892967daacb13eea589..87abe1d374df5be5738d9a0a1395d3601c590aa4 100644 --- a/dnn/src/naive/conv_bias/opr_impl.cpp +++ b/dnn/src/naive/conv_bias/opr_impl.cpp @@ -118,6 +118,9 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, DISPATCH(QuantizedS8, QuantizedS32) DISPATCH(Quantized8Asymm, QuantizedS32) DISPATCH(Quantized4Asymm, QuantizedS32) + DISPATCH_RAW(QuantizedS8, QuantizedS32, QuantizedS32, FLOAT32, + (convolution::forward_bias)) #if !MEGDNN_DISABLE_FLOAT16 DISPATCH(Float16, Float16) DISPATCH_RAW(Float16, Float16, Float16, FLOAT32, diff --git a/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp b/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp index 0ebf578d698d60a78a480a8e3b76787fa2297c48..26490bf153d7164240907a30ad131e660811a2ee 100644 --- a/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp +++ b/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp @@ -171,7 +171,6 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, } } #undef cb -#undef DISPATCH_FORMAT_MK8 #undef DISPATCH_DTYPE } else if (pack_c_size == 4) { //! NCHW44 @@ -195,6 +194,15 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \ DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \ + } \ + if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { \ + if (param().format == param::Winograd::Format::MK4) { \ + DISPATCH_KERNEL(dt_int8, dt_int8, dt_float32, dt_float32, \ + DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \ + } else if (param().format == param::Winograd::Format::MK8) { \ + DISPATCH_KERNEL(dt_int8, dt_int8, dt_int16, dt_int32, \ + DISPATCH_FORMAT_MK8, 2.0f, _midout_tag, 0); \ + } \ } if (FW == 3) { if (m == 2) { @@ -208,6 +216,7 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, } #undef cb #undef DISPATCH_FORMAT_MK8 +#undef DISPATCH_FORMAT_MK4 #undef DISPATCH_KERNEL #undef DISPATCH_DTYPE } diff --git a/dnn/test/arm_common/conv_bias.cpp b/dnn/test/arm_common/conv_bias.cpp index 55c01ec6f4daeda4d64408d8cdf7fe6cd14d8b22..7651dd7ed94fb85a74dc576074121bc440d8f4ed 100644 --- a/dnn/test/arm_common/conv_bias.cpp +++ b/dnn/test/arm_common/conv_bias.cpp @@ -98,7 +98,8 @@ CB_TEST(H_SWISH); #if MEGDNN_WITH_BENCHMARK -static void benchmark_convbias(Handle* handle, bool is_fp32 = false) { +static void benchmark_convbias(Handle* handle, std::string int_name, + std::string float_name, bool is_fp32 = false) { constexpr size_t RUNS = 30; Benchmarker benchmarker_int(handle); @@ -109,12 +110,12 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) { .set_dtype(4, dtype::QuantizedS8(60.25)) .set_display(false); benchmarker_int.set_before_exec_callback( - conv_bias::ConvBiasAlgoChecker("IM2COLMATMUL:.+")); + conv_bias::ConvBiasAlgoChecker(int_name.c_str())); Benchmarker benchmarker_float(handle); benchmarker_float.set_display(false).set_times(RUNS); benchmarker_float.set_before_exec_callback( - conv_bias::ConvBiasAlgoChecker("IM2COLMATMUL:.+")); + conv_bias::ConvBiasAlgoChecker(float_name.c_str())); Benchmarker benchmarker_nchw44(handle); if (is_fp32) { @@ -233,13 +234,24 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) { } } } + TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_NCHW44) { - benchmark_convbias(handle(), true); - benchmark_convbias(handle(), false); +#if MEGDNN_AARCH64 + benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384", + "IM2COLMATMUL:AARCH64_F32K8X12X1:192", true); +#else + benchmark_convbias(handle(), "IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8:384", + "IM2COLMATMUL:ARMV7_F32:192", true); +#endif } TEST_F(ARM_COMMON_MULTI_THREADS, BENCHMARK_CONVBIAS_NCHW44) { - benchmark_convbias(handle(), true); - benchmark_convbias(handle(), false); +#if MEGDNN_AARCH64 + benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384", + "IM2COLMATMUL:AARCH64_F32K8X12X1:192", true); +#else + benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384", + "IM2COLMATMUL:ARMV7_F32:192", true); +#endif } #endif @@ -506,7 +518,7 @@ void BENCHMARK_IM2COL_NCHW44_VS_NCHW(const char* algo_name, computations / used_im2col, used / used_im2col); } } - +#if MEGDNN_AARCH64 TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) { printf("=========================compare " "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16, " @@ -515,6 +527,7 @@ TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) { "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16", handle(), 3, 4); } +#endif TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONVBIAS_QUANTIZED) { constexpr size_t RUNS = 50; diff --git a/dnn/test/arm_common/conv_bias_multi_thread.cpp b/dnn/test/arm_common/conv_bias_multi_thread.cpp index 0f41a0e309f8283e2d83dd4e7c8b4465aab91910..2ca1c830934940f456a50863a5f24e955c0370b0 100644 --- a/dnn/test/arm_common/conv_bias_multi_thread.cpp +++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp @@ -989,7 +989,6 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_INT8) { checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( ssprintf("WINOGRAD:%s:8:2:32", matmul_name).c_str())); - std::vector args = get_winograd_mk_packed_args(8); std::vector quantized_args = get_quantized_winograd_mk_packed_args(8); UniformIntRNG int_rng{-50, 50}; @@ -999,6 +998,174 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_INT8) { dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); } + +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8) { + using namespace conv_bias; + + Checker checker(handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + +#if MEGDNN_AARCH64 + const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8"; +#else + const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8"; +#endif + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( + ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str())); + + std::vector quantized_args = get_int8_nchw44_args (3,4); + UniformIntRNG int_rng{-50, 50}; + checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); + run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), + dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); +} + + +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_GROUPMODE) { + using namespace conv_bias; + + Checker checker(handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + +#if MEGDNN_AARCH64 + const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8"; +#else + const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8"; +#endif + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( + ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str())); + + std::vector quantized_args = + get_int8_nchw44_args(3, 4, false, true); + UniformIntRNG int_rng{-50, 50}; + checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); + run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), + dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); +} + +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32) { + using namespace conv_bias; + + Checker checker(handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + + float epsilon = 0.001; +#if MEGDNN_AARCH64 + const char* matmul_name = "AARCH64_F32_MK4_4x16"; +#else + const char* matmul_name = "ARMV7_F32_MK4_4x8"; +#endif + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( + ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str())); + std::vector quantized_args = + get_int8_nchw44_args(3, 4, true); + UniformIntRNG int_rng{-50, 50}; + checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); + run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f), + dtype::QuantizedS8(0.01887994f), + dtype::QuantizedS32(0.41113496f * 0.01887994f), + dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, epsilon); +} + +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32_GROUPMODE) { + using namespace conv_bias; + + Checker checker(handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + + float epsilon = 0.001; +#if MEGDNN_AARCH64 + const char* matmul_name = "AARCH64_F32_MK4_4x16"; +#else + const char* matmul_name = "ARMV7_F32_MK4_4x8"; +#endif + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( + ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str())); + std::vector quantized_args = + get_int8_nchw44_args(3, 4, true, true); + UniformIntRNG int_rng{-50, 50}; + checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); + run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f), + dtype::QuantizedS8(0.01887994f), + dtype::QuantizedS32(0.41113496f * 0.01887994f), + dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, epsilon); +} + + #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_F23) { using namespace conv_bias; diff --git a/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp b/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp index e8accc27b0c28c6ebff386744eb95c020c33ca8a..945f6cc9370c0c4981022ffa6ea68c7fadc3dc7d 100644 --- a/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp +++ b/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp @@ -1185,6 +1185,197 @@ TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_F32) { {1, {4}}, data_type); } +TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_INT8) { + constexpr size_t RUNS = 50; + + param::ConvBias param; + param.nonlineMode = param::ConvBias::NonlineMode::RELU; + param.pad_h = 1; + param.pad_w = 1; + param.stride_h = 1; + param.stride_w = 1; + param.sparse = param::ConvBias::Sparse::GROUP; + + std::vector, float>> + shapes_and_computation; + auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, + size_t FS, size_t group) { + SmallVector shapes{{N, IC, H, W}, + {group, OC / group, IC / group, FS, FS}, + {1, OC, 1, 1}, + {}, + {N, OC, H, W}}; + TensorShape dst{N, OC, H, W}; + float computations = + ((IC / group) * FS * FS * dst.total_nr_elems() * 2 + + dst.total_nr_elems()) * + 1e-6; + shapes_and_computation.push_back(std::make_pair(shapes, computations)); + }; + + bench_case(1, 32, 32, 200, 200, 3, 4); + bench_case(1, 32, 32, 200, 200, 3, 1); + bench_case(1, 32, 32, 128, 128, 3, 4); + bench_case(1, 32, 32, 128, 128, 3, 1); + bench_case(1, 32, 32, 100, 100, 3, 4); + bench_case(1, 32, 32, 100, 100, 3, 1); + bench_case(1, 32, 32, 80, 80, 3, 4); + + bench_case(1, 512, 512, 14, 14, 3, 1); + bench_case(1, 512, 256, 14, 14, 3, 1); + bench_case(1, 512, 128, 14, 14, 3, 1); + bench_case(1, 512, 64, 14, 14, 3, 1); + + bench_case(1, 512, 512, 7, 7, 3, 1); + bench_case(1, 512, 256, 7, 7, 3, 1); + bench_case(1, 512, 128, 7, 7, 3, 1); + bench_case(1, 512, 64, 7, 7, 3, 1); + + std::string algo_name; +#if MEGDNN_AARCH64 + algo_name = "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2:32"; +#else + algo_name = "WINOGRAD:ARMV7_INT16X16X32_MK8_4X8:8:2:32"; +#endif + + + std::vector data_type = {dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), + dtype::QuantizedS32(6.25f) ,dtype::QuantizedS8(60.25f) }; + printf("Benchmark WINOGRAD_IN8_MK8 algo\n"); + benchmark_impl(param, shapes_and_computation, algo_name, RUNS, + {4, {4, 5, 6, 7}}, {1, {4}}, data_type); + benchmark_impl(param, shapes_and_computation, algo_name, RUNS, + {4, {4, 5, 6, 7}}, {1, {7}}, data_type); + benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, + {1, {4}}, data_type); +} + +TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, + BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_MK8) { + constexpr size_t RUNS = 50; + + param::ConvBias param; + param.nonlineMode = param::ConvBias::NonlineMode::RELU; + param.pad_h = 1; + param.pad_w = 1; + param.stride_h = 1; + param.stride_w = 1; + param.sparse = param::ConvBias::Sparse::DENSE; + param.format = param::ConvBias::Format::NCHW44; + + std::vector, float>> + shapes_and_computation; + auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, + size_t FS, size_t group) { + SmallVector shapes{{N, IC / 4, H, W, 4}, + {OC / 4, IC / 4, FS, FS, 4, 4}, + {1, OC / 4, 1, 1, 4}, + {}, + {N, OC / 4, H, W, 4}}; + TensorShape dst{N, OC, H, W}; + float computations = + ((IC / group) * FS * FS * dst.total_nr_elems() * 2 + + dst.total_nr_elems()) * + 1e-6; + shapes_and_computation.push_back(std::make_pair(shapes, computations)); + }; + + bench_case(1, 32, 32, 200, 200, 3, 1); + bench_case(1, 32, 32, 128, 128, 3, 1); + bench_case(1, 32, 32, 100, 100, 3, 1); + + bench_case(1, 512, 512, 14, 14, 3, 1); + bench_case(1, 512, 256, 14, 14, 3, 1); + bench_case(1, 512, 128, 14, 14, 3, 1); + bench_case(1, 512, 64, 14, 14, 3, 1); + + bench_case(1, 512, 512, 7, 7, 3, 1); + bench_case(1, 512, 256, 7, 7, 3, 1); + bench_case(1, 512, 128, 7, 7, 3, 1); + bench_case(1, 512, 64, 7, 7, 3, 1); + + std::string algo_name; +#if MEGDNN_AARCH64 + algo_name = "WINOGRAD_NCHW44:AARCH64_INT16X16X32_MK8_8X8:8:2:32"; +#else + algo_name = "WINOGRAD_NCHW44:ARMV7_INT16X16X32_MK8_4X8:8:2:32"; +#endif + + std::vector data_type = { + dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), + dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)}; + printf("Benchmark WINOGRAD_INT8_MK8 algo\n"); + benchmark_impl(param, shapes_and_computation, algo_name, RUNS, + {4, {4, 5, 6, 7}}, {1, {4}}, data_type); + benchmark_impl(param, shapes_and_computation, algo_name, RUNS, + {4, {4, 5, 6, 7}}, {1, {7}}, data_type); + benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, + {1, {4}}, data_type); +} + +TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, + BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_COMP_F32) { + constexpr size_t RUNS = 50; + + param::ConvBias param; + param.nonlineMode = param::ConvBias::NonlineMode::RELU; + param.pad_h = 1; + param.pad_w = 1; + param.stride_h = 1; + param.stride_w = 1; + param.sparse = param::ConvBias::Sparse::DENSE; // GROUP; + param.format = param::ConvBias::Format::NCHW44; + + std::vector, float>> + shapes_and_computation; + auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, + size_t FS, size_t group) { + SmallVector shapes{{N, IC / 4, H, W, 4}, + {OC / 4, IC / 4, FS, FS, 4, 4}, + {1, OC / 4, 1, 1, 4}, + {}, + {N, OC / 4, H, W, 4}}; + TensorShape dst{N, OC, H, W}; + float computations = + ((IC / group) * FS * FS * dst.total_nr_elems() * 2 + + dst.total_nr_elems()) * + 1e-6; + shapes_and_computation.push_back(std::make_pair(shapes, computations)); + }; + + bench_case(1, 32, 32, 200, 200, 3, 1); + bench_case(1, 32, 32, 128, 128, 3, 1); + bench_case(1, 32, 32, 100, 100, 3, 1); + + bench_case(1, 512, 512, 14, 14, 3, 1); + bench_case(1, 512, 256, 14, 14, 3, 1); + bench_case(1, 512, 128, 14, 14, 3, 1); + bench_case(1, 512, 64, 14, 14, 3, 1); + + bench_case(1, 512, 512, 7, 7, 3, 1); + bench_case(1, 512, 256, 7, 7, 3, 1); + bench_case(1, 512, 128, 7, 7, 3, 1); + bench_case(1, 512, 64, 7, 7, 3, 1); + + std::string algo_name; +#if MEGDNN_AARCH64 + algo_name = "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:2:32"; +#else + algo_name = "WINOGRAD_NCHW44:ARMV7_F32_MK4_4x8:4:2:32"; +#endif + + std::vector data_type = { + dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), + dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)}; + printf("Benchmark WINOGRAD_INT8_NCHW44_MK4_COMP_F32 algo\n"); + benchmark_impl(param, shapes_and_computation, algo_name, RUNS, + {4, {4, 5, 6, 7}}, {1, {4}}, data_type); + benchmark_impl(param, shapes_and_computation, algo_name, RUNS, + {4, {4, 5, 6, 7}}, {1, {7}}, data_type); + benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, + {1, {4}}, data_type); +} + TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_FP32) { constexpr size_t RUNS = 50; diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp index 22fdf90af02d09215a2c8226d14273df4e982a4d..8bd3f5b01dfe1fcb4be273738f23d122bb464b56 100644 --- a/dnn/test/common/conv_bias.cpp +++ b/dnn/test/common/conv_bias.cpp @@ -9,6 +9,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ #include "test/common/conv_bias.h" +#include "megdnn/opr_param_defs.h" #include "src/common/utils.h" #include "test/common/benchmarker.h" namespace megdnn { @@ -242,7 +243,8 @@ std::vector get_winograd_mk_packed_args(size_t pack_size) { return args; } -std::vector get_quantized_winograd_mk_packed_args(size_t pack_size) { +std::vector get_quantized_winograd_mk_packed_args( + size_t pack_size, bool compute_float32) { std::vector args; param::ConvBias cur_param; @@ -260,13 +262,16 @@ std::vector get_quantized_winograd_mk_packed_args(size_t pack_size) { cur_param.sparse = param::ConvBias::Sparse::DENSE; cur_param.pad_h = cur_param.pad_w = 1; + if(compute_float32){ + cur_param.compute_mode = param::ConvBias::ComputeMode::FLOAT32; + } + args.emplace_back(cur_param, TensorShape{1, pack_size, 3, 3}, TensorShape{pack_size, pack_size, 3, 3}, TensorShape{1, pack_size, 1, 1}); //! no bias args.emplace_back(cur_param, TensorShape{2, ic, i, i}, TensorShape{oc, ic, 3, 3}, TensorShape{}); - //! bias args.emplace_back(cur_param, TensorShape{2, ic, i, i}, TensorShape{oc, ic, 3, 3}, TensorShape{2, oc, i, i}); @@ -372,7 +377,7 @@ std::vector get_int8_nchw4_args(size_t kernel_size) { for (auto mode : {param::ConvBias::Mode::CROSS_CORRELATION}) { for (size_t b : {64, 16}) { for (size_t ic : {16, 32}) { - for (size_t oc : {64, 32}) { + for (size_t oc : {16, 32}) { for (size_t h : {8}) { for (size_t w : {8, 11}) { for (int p : {0, static_cast(kernel_size / 2)}) { @@ -399,6 +404,95 @@ std::vector get_int8_nchw4_args(size_t kernel_size) { return args; } +std::vector get_int8_nchw44_args(size_t kernel_size, size_t pack_size, + bool compute_float32, + bool group_mode) { + std::vector args; + param::ConvBias cur_param; + megdnn_assert(pack_size > 0, "not support pack_size"); + megdnn_assert(kernel_size > 0, "not support kernel_size"); + using NLMode = param::ConvBias::NonlineMode; + + //// clang-format off + for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU}) { + for (auto mode : {param::ConvBias::Mode::CROSS_CORRELATION}) { + for (size_t b : {1,2}) { + for (size_t ic : {8,16}) { + for (size_t oc : {8,16}) { + for (size_t h : {9,23}) { + for (size_t w : {9,23}) { + for (int p : {0, static_cast(kernel_size / 2)}) { + for (size_t s : {1}) { + if (kernel_size == 7) { + b = std::min(b, 32_z); + } + size_t f = kernel_size; + cur_param.mode = mode; + cur_param.nonlineMode = nlmode; + if (pack_size == 4){ + cur_param.format = param::ConvBias::Format::NCHW44; + } else if(pack_size == 8){ + cur_param.format = param::ConvBias::Format::NCHW88; + } + + if(compute_float32){ + cur_param.compute_mode = + param::ConvBias::ComputeMode::FLOAT32; + } + + cur_param.sparse = param::ConvBias::Sparse::DENSE; + cur_param.pad_h = cur_param.pad_w = p; + cur_param.stride_h = cur_param.stride_w = s; + if (!group_mode) { + //! no bias + args.emplace_back(cur_param, + TensorShape{b, ic / pack_size, h, w, pack_size}, + TensorShape{oc / pack_size, ic / pack_size, f, f, + pack_size, pack_size}, + TensorShape{}); + + //! bias channel + args.emplace_back(cur_param, + TensorShape{b, ic / pack_size, h, w, pack_size}, + TensorShape{oc / pack_size, ic / pack_size, f, f, + pack_size, pack_size}, + TensorShape{1, oc / pack_size, 1, 1, pack_size}); + //! bias + args.emplace_back( + cur_param, TensorShape{b, ic / pack_size, h, w, pack_size}, + TensorShape{oc / pack_size, ic / pack_size, f, f, pack_size, + pack_size}, + TensorShape{b, oc / pack_size, (h - f + 2 * p) / s + 1, + (w - f + 2 * p) / s + 1, pack_size}); + } else { + cur_param.sparse = param::ConvBias::Sparse::GROUP; + args.emplace_back( + cur_param, + TensorShape{2, 2 * ic / pack_size, h, w, pack_size}, + TensorShape{2, oc / pack_size, ic / pack_size, 3, 3, + pack_size, pack_size}, + TensorShape{2, 2 * oc / pack_size, (h - f + 2 * p) / s + 1, + (w - f + 2 * p) / s + 1, pack_size}); + + args.emplace_back( + cur_param, + TensorShape{2, 2 * ic / pack_size, h, w, pack_size}, + TensorShape{2, oc / pack_size, ic / pack_size, f, f, + pack_size, pack_size}, + TensorShape{1, 2 * oc / pack_size, 1, 1, pack_size}); + args.emplace_back( + cur_param, + TensorShape{2, 2 * ic / pack_size, h, w, pack_size}, + TensorShape{2, oc / pack_size, ic / pack_size, f, f, + pack_size, pack_size}, + TensorShape{}); + } + } } } } } } } } } + // clang-format on + + return args; +} + std::vector get_int8_nchw4_args_check_bounds(size_t kernel_size) { std::vector args; param::ConvBias cur_param; @@ -990,11 +1084,14 @@ void checker_conv_bias_int8x8x16(std::vector args, void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, param::ConvBias param, Handle* handle, param::MatrixMul::Format format) { - megdnn_assert(param.format == param::ConvBias::Format::NCHW); + megdnn_assert(param.format == param::ConvBias::Format::NCHW || + param.format == param::ConvBias::Format::NCHW44); auto winograd_preprocess_opr = handle->create_operator(); winograd_preprocess_opr->param().output_block_size = m; winograd_preprocess_opr->param().format = format; + winograd_preprocess_opr->param().compute_mode = + param.compute_mode; TensorLayout filter_transform_layout; winograd_preprocess_opr->deduce_layout(tensors[1].layout, filter_transform_layout); @@ -1004,7 +1101,12 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, auto conv_bias_opr = handle->create_operator(); conv_bias_opr->param() = param; - conv_bias_opr->param().format = param::ConvBias::Format::NCHW_WINOGRAD; + if (param.format == param::ConvBias::Format::NCHW) { + conv_bias_opr->param().format = param::ConvBias::Format::NCHW_WINOGRAD; + } else { + conv_bias_opr->param().format = + param::ConvBias::Format::NCHW44_WINOGRAD; + } conv_bias_opr->param().output_block_size = m; size_t conv_bias_workspace_in_bytes = conv_bias_opr->get_workspace_in_bytes( tensors[0].layout, filter_transform_layout, tensors[2].layout, @@ -1021,7 +1123,6 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, wb.get_workspace(2)); conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], tensors[3], tensors[4], nullptr, wb.get_workspace(1)); - free(wb.ptr()); }; diff --git a/dnn/test/common/conv_bias.h b/dnn/test/common/conv_bias.h index 9ef78d6dd14a165d6c0dd3c3df61723e41544fdb..e52815b5494015b8943348e916f866aec87f4e9d 100644 --- a/dnn/test/common/conv_bias.h +++ b/dnn/test/common/conv_bias.h @@ -36,7 +36,7 @@ std::vector get_chanwise_args(); std::vector get_winograd_args(size_t kernel_size); std::vector get_winograd_mk_packed_args(size_t pack_size = 4); std::vector get_quantized_winograd_mk_packed_args( - size_t pack_size = 4); + size_t pack_size = 4, bool compute_float32 = false); std::vector get_quantized_args_with_nlmode( param::ConvBias::NonlineMode nlmode); std::vector get_quantized_args(); @@ -55,6 +55,10 @@ std::vector get_int8_chwn4_args_small_batch(size_t kernel_size); std::vector get_int8_nchw4_tensorcore_args(size_t kernel_size); std::vector get_int8_chwn4_tensorcore_args(size_t kernel_size); +std::vector get_int8_nchw44_args(size_t kernel_size, size_t pack_size, + bool compute_float32 = false, + bool group_mode = false); + template using ConvBiasAlgoChecker = AlgoChecker; diff --git a/src/gopt/impl/weights_preprocess.cpp b/src/gopt/impl/weights_preprocess.cpp index 3f8c84f3f379f6f94fea6003061934918f6e231d..31606613dcfb6b6ef5980fbea4ff0d03e9fb8235 100644 --- a/src/gopt/impl/weights_preprocess.cpp +++ b/src/gopt/impl/weights_preprocess.cpp @@ -102,12 +102,28 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { opr::ConvBiasForward::get_matmul_format(winograd_param); winograd_preprocess_param.output_block_size = winograd_param.output_block_size; + + size_t pack_c_size = 1; + if (new_inp[0]->shape().ndim == 5) { + pack_c_size = new_inp[0]->layout().shape[4]; + } + + if (conv_bias_opr.input(0)->dtype().enumv() == + DTypeEnum::QuantizedS8 && + pack_c_size == 4 && + winograd_preprocess_param.format == + megdnn::param::MatrixMul::Format::MK4) { + winograd_preprocess_param.compute_mode = + megdnn::param::ConvBias::ComputeMode::FLOAT32; + } + auto winograd_preprocess_opr = opr::WinogradFilterPreprocess::make( new_inp[1], winograd_preprocess_param); mgb_assert(inputs.size() == 2 || inputs.size() == 3, "input size need to be 2/3, but got: %zu", inputs.size()); SymbolVar new_conv_bias_opr; + auto conv_bias_param = conv_bias_opr.param(); if (new_inp[0]->shape().ndim == 4) { conv_bias_param.format = @@ -126,6 +142,7 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { algo_name.c_str()); } } + conv_bias_param.output_block_size = winograd_param.output_block_size; if (inputs.size() == 2) { diff --git a/src/opr/impl/dnn/convolution.cpp b/src/opr/impl/dnn/convolution.cpp index bf65ac8ee2291dd9f1ad293900a76ff3c6bcd12f..9f72c3f71fbd3cc29abe1dc2064b6926bd270301 100644 --- a/src/opr/impl/dnn/convolution.cpp +++ b/src/opr/impl/dnn/convolution.cpp @@ -1541,8 +1541,9 @@ void ConvBiasForward::check_winograd_param_valid( dtype.enumv() == DTypeEnum::QuantizedS8 || dtype.enumv() == DTypeEnum::Quantized8Asymm) && (param.channel_block_size == 1 || + param.channel_block_size == 4 || param.channel_block_size == 8), - "only support 1/8 for the channel_block_size of " + "only support 1/4/8 for the channel_block_size of " "winograd param, got %u", param.channel_block_size); }