提交 e4cc85e5 编写于 作者: M Megvii Engine Team

feat(fallback): move arm_common f32 convbias to fallback gi

GitOrigin-RevId: ccf8b589be5d9638a446fdd31df72eed9f79a863
上级 0f1afb09
......@@ -28,7 +28,6 @@
#include "include/megdnn/oprs/nn.h"
#include "src/arm_common/conv_bias/f16/algos.h"
#include "src/arm_common/conv_bias/fp32/algos.h"
#include "src/arm_common/conv_bias/int8/stride1.h"
#include "src/arm_common/conv_bias/int8/stride2.h"
#include "src/arm_common/conv_bias/quint8/stride1.h"
......@@ -69,14 +68,6 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
AlgoDotS8DirectNCHWNCHW44 ds8_direct_nchw_nchw44;
#endif
AlgoF32DirectNCHWNCHW44 f32_direct_stride2_nchw_nchw44;
AlgoF32ChannelWiseNCHW44 f32_chanel_wise_nchw44;
AlgoF32DirectNCHW44 f32_direct_nchw44;
AlgoF32Direct f32_direct;
AlgoF32DirectStride2 f32_direct_stride2;
AlgoF32DirectStride1 f32_direct_stride1;
AlgoI8x8x16Direct i8x8x16_direct;
AlgoI8x8x16Stride2 i8x8x16_stride2;
AlgoI8x8x16Stride2Filter2 i8x8x16_stride2_filter2;
......@@ -127,14 +118,6 @@ public:
m_direct_algos.emplace_back(&i8x8x16_stride2);
m_direct_algos.emplace_back(&i8x8x16_nchw_nchw44);
m_direct_algos.emplace_back(&f32_direct_stride2_nchw_nchw44);
m_direct_algos.emplace_back(&f32_chanel_wise_nchw44);
m_direct_algos.emplace_back(&f32_direct_nchw44);
m_direct_algos.emplace_back(&f32_direct_stride1);
m_direct_algos.emplace_back(&f32_direct_stride2);
m_direct_algos.emplace_back(&f32_direct);
static CpuOprDelegationStorage<2> storage;
auto matmul_opr = storage.get<MatrixMul, 0>();
using MatmulFormat = param::MatrixMul::Format;
......@@ -145,22 +128,6 @@ public:
if (is_fallback_or_naive(algo))
continue;
for (uint32_t tile_size : {16, 8, 24, 32}) {
refhold.emplace_back(new AlgoFP32WinogradF23_4x4(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size));
m_winograd_algos.emplace_back(refhold.back().get());
refhold.emplace_back(new AlgoFP32WinogradF63_4x4(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size));
m_winograd_algos.emplace_back(refhold.back().get());
refhold.emplace_back(new AlgoFP32WinogradF63_4x4_NCHW44(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size));
m_winograd_algos.emplace_back(refhold.back().get());
refhold.emplace_back(new AlgoFP32WinogradF23_4x4_NCHW44(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size));
m_winograd_algos.emplace_back(refhold.back().get());
//! uncomment this when low precision mode is done
#if 0
refhold.emplace_back(new AlgoFP32WinogradF73_4x4_NCHW44(
......@@ -175,27 +142,6 @@ public:
m_winograd_algos.emplace_back(refhold.back().get());
}
}
matmul_algos = static_cast<arm_common::MatrixMulImpl*>(matmul_opr)
->select_algo_type(
{AlgoDataType::FLOAT32, MatmulFormat::DEFAULT});
for (auto&& algo : matmul_algos) {
if (is_fallback_or_naive(algo))
continue;
for (uint32_t tile_size : {16, 8, 24, 32}) {
refhold.emplace_back(new AlgoFP32WinogradF63(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size));
m_winograd_algos.emplace_back(refhold.back().get());
refhold.emplace_back(new AlgoFP32WinogradF54(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size));
m_winograd_algos.emplace_back(refhold.back().get());
refhold.emplace_back(new AlgoFP32WinogradF45(
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
tile_size));
m_winograd_algos.emplace_back(refhold.back().get());
}
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
matmul_algos = static_cast<arm_common::MatrixMulImpl*>(matmul_opr)
......
......@@ -49,15 +49,6 @@ private:
class AlgoS8DirectNCHWNCHW44;
class AlgoQU8DirectStride1;
class AlgoQU8DirectStride2;
class AlgoFP32WinogradF23_4x4;
class AlgoFP32WinogradF63;
class AlgoFP32WinogradF63_4x4;
class AlgoFP32WinogradF54;
class AlgoFP32WinogradF45;
class AlgoFP32WinogradF23_4x4_NCHW44;
class AlgoFP32WinogradF63_4x4_NCHW44;
class AlgoFP32WinogradF73_4x4_NCHW44;
class AlgoS8ChanWiseStride1NCHW44;
class AlgoS8ChanWiseStride2NCHW44;
......@@ -78,12 +69,6 @@ private:
class AlgoDotS8Direct_NCHW44;
#endif
class AlgoF32Direct;
class AlgoF32DirectStride1;
class AlgoF32DirectStride2;
class AlgoF32DirectNCHWNCHW44;
class AlgoF32ChannelWiseNCHW44;
class AlgoF32DirectNCHW44;
class AlgoI8x8x16Direct;
class AlgoI8x8x16Stride2;
......
......@@ -10,6 +10,8 @@
*/
#pragma once
#include "megbrain_build_config.h"
#include "src/fallback/conv_bias/opr_impl.h"
#include "src/fallback/matrix_mul/opr_impl.h"
......
/**
* \file dnn/src/fallback/conv_bias/gi/block_helper.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/common/utils.h"
namespace megdnn {
namespace {
// block_helper is used to calculate oh block size
static inline int l2_block_helper(
const int nthread, const int amount, const int size_per_unit) {
//! TODO: opt config or dynamic config l2_cache_size for different ARCH
constexpr int l2_cache_size = 256 * 1024;
const int block_per_thread = div_ceil(amount, nthread);
const int best_block =
std::min(amount, (l2_cache_size + size_per_unit / 2) / size_per_unit);
const int max_block_num = div_ceil(block_per_thread, best_block);
const int min_block_num = std::max(max_block_num - 1, 1);
const int max_block = div_ceil(block_per_thread, max_block_num);
const int min_block = div_ceil(block_per_thread, min_block_num);
const int max_loss = std::abs(max_block_num * max_block - block_per_thread);
const int min_loss = std::abs(min_block_num * min_block - block_per_thread);
int block = max_loss > min_loss ? min_block : max_block;
return block;
}
} // namespace
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/arm_common/conv_bias/fp32/algos.cpp
* \file dnn/src/fallback/conv_bias/gi/fp32/algos.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,23 +10,22 @@
* implied.
*/
#include "src/arm_common/conv_bias/fp32/algos.h"
#include "src/arm_common/conv_bias/fp32/direct.h"
#include "src/arm_common/conv_bias/fp32/do_conv_stride1.h"
#include "src/arm_common/conv_bias/fp32/do_conv_stride2.h"
#include "src/arm_common/conv_bias/fp32/strategy.h"
#include "src/arm_common/conv_bias/img2col_helper.h"
#include "src/arm_common/conv_bias/postprocess_helper.h"
#include "src/fallback/conv_bias/gi/fp32/algos.h"
#include "src/common/opr_delegate.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/direct/multi_thread_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct.h"
#include "src/fallback/conv_bias/gi/fp32/do_conv_stride1.h"
#include "src/fallback/conv_bias/gi/fp32/do_conv_stride2.h"
#include "src/fallback/conv_bias/gi/fp32/strategy.h"
#include "src/fallback/conv_bias/gi/postprocess_helper.h"
#include "midout.h"
MIDOUT_DECL(megdnn_arm_common_winograd_fp32)
MIDOUT_DECL(megdnn_fallback_winograd_fp32)
using namespace megdnn;
using namespace arm_common;
using namespace fallback;
/* ======================= AlgoFP32WinogradF23_4x4 ======================== */
......@@ -34,10 +33,10 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4::usable(
const NCBKernSizeParam& param,
AlgoSelectionStrategy /*algo_selection_strategy*/) const {
MEGDNN_MARK_USED_VAR(param);
MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 0, 0) {
MIDOUT_BEGIN(megdnn_fallback_winograd_fp32, 0, 0) {
if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
return false;
using Strategy = winograd::winograd_2x3_4x4_f;
using Strategy = winograd::winograd_gi_2x3_4x4_f;
using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode;
Strategy strategy(param.src_type, param.filter_type, param.dst_type);
auto&& matmul_param =
......@@ -62,8 +61,8 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4::usable(
}
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF23_4x4, winograd::winograd_2x3_4x4_f,
megdnn_arm_common_winograd_fp32, param::MatrixMul::Format::MK4);
AlgoFP32WinogradF23_4x4, winograd::winograd_gi_2x3_4x4_f,
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
/* ======================= AlgoFP32WinogradF63 ======================== */
......@@ -71,7 +70,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63::usable(
const NCBKernSizeParam& param,
AlgoSelectionStrategy /*algo_selection_strategy*/) const {
MEGDNN_MARK_USED_VAR(param);
MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 1, 0) {
MIDOUT_BEGIN(megdnn_fallback_winograd_fp32, 1, 0) {
using Strategy = winograd::winograd_6x3_1x1_f;
Strategy strategy(param.src_type, param.filter_type, param.dst_type);
auto&& matmul_param =
......@@ -95,7 +94,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63::usable(
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF63, winograd::winograd_6x3_1x1_f,
megdnn_arm_common_winograd_fp32, param::MatrixMul::Format::DEFAULT);
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::DEFAULT);
/* ======================= AlgoFP32WinogradF54 ======================== */
......@@ -103,7 +102,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF54::usable(
const NCBKernSizeParam& param,
AlgoSelectionStrategy /*algo_selection_strategy*/) const {
MEGDNN_MARK_USED_VAR(param);
MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 2, 0) {
MIDOUT_BEGIN(megdnn_fallback_winograd_fp32, 2, 0) {
using Strategy = winograd::winograd_5x4_1x1_f;
Strategy strategy(param.src_type, param.filter_type, param.dst_type);
auto&& matmul_param =
......@@ -127,7 +126,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF54::usable(
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF54, winograd::winograd_5x4_1x1_f,
megdnn_arm_common_winograd_fp32, param::MatrixMul::Format::DEFAULT);
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::DEFAULT);
/* ======================= AlgoFP32WinogradF45 ======================== */
......@@ -135,7 +134,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF45::usable(
const NCBKernSizeParam& param,
AlgoSelectionStrategy /*algo_selection_strategy*/) const {
MEGDNN_MARK_USED_VAR(param);
MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 3, 0) {
MIDOUT_BEGIN(megdnn_fallback_winograd_fp32, 3, 0) {
using Strategy = winograd::winograd_4x5_1x1_f;
Strategy strategy(param.src_type, param.filter_type, param.dst_type);
auto&& matmul_param =
......@@ -159,7 +158,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF45::usable(
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF45, winograd::winograd_4x5_1x1_f,
megdnn_arm_common_winograd_fp32, param::MatrixMul::Format::DEFAULT);
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::DEFAULT);
/* ======================= AlgoFP32WinogradF63_4x4 ======================== */
......@@ -167,7 +166,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4::usable(
const NCBKernSizeParam& param,
AlgoSelectionStrategy /*algo_selection_strategy*/) const {
MEGDNN_MARK_USED_VAR(param);
MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 4, 0) {
MIDOUT_BEGIN(megdnn_fallback_winograd_fp32, 4, 0) {
if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
return false;
using Strategy = winograd::winograd_6x3_4x4_f;
......@@ -197,7 +196,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4::usable(
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF63_4x4, winograd::winograd_6x3_4x4_f,
megdnn_arm_common_winograd_fp32, param::MatrixMul::Format::MK4);
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
/* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */
......@@ -206,7 +205,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable(
AlgoSelectionStrategy /*algo_selection_strategy*/) const {
MEGDNN_MARK_USED_VAR(param);
MIDOUT_BEGIN(
megdnn_arm_common_winograd_fp32,
megdnn_fallback_winograd_fp32,
midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) {
if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
return false;
......@@ -236,7 +235,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable(
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF23_4x4_NCHW44, winograd::winograd_F23_mk4_f_nchw44,
megdnn_arm_common_winograd_fp32, param::MatrixMul::Format::MK4);
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
/* =================== AlgoFP32WinogradF63_4x4_NCHW44 ===================== */
......@@ -245,7 +244,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::usable(
AlgoSelectionStrategy /*algo_selection_strategy*/) const {
MEGDNN_MARK_USED_VAR(param);
MIDOUT_BEGIN(
megdnn_arm_common_winograd_fp32,
megdnn_fallback_winograd_fp32,
midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) {
if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
return false;
......@@ -276,7 +275,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::usable(
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF63_4x4_NCHW44, winograd::winograd_F63_mk4_f_nchw44,
megdnn_arm_common_winograd_fp32, param::MatrixMul::Format::MK4);
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
/* =================== AlgoFP32WinogradF73_4x4_NCHW44 ===================== */
......@@ -284,7 +283,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF73_4x4_NCHW44::usable(
const NCBKernSizeParam& param,
AlgoSelectionStrategy /*algo_selection_strategy*/) const {
MIDOUT_BEGIN(
megdnn_arm_common_winograd_fp32,
megdnn_fallback_winograd_fp32,
midout_iv("AlgoFP32WinogradF73_4x4_NCHW44"_hash)) {
if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
return false;
......@@ -314,14 +313,14 @@ bool ConvBiasImpl::AlgoFP32WinogradF73_4x4_NCHW44::usable(
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF73_4x4_NCHW44, winograd::winograd_F73_mk4_f_nchw44,
megdnn_arm_common_winograd_fp32, param::MatrixMul::Format::MK4);
megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
/* ===================== direct algo ===================== */
MIDOUT_DECL(megdnn_arm_common_conv_bias_f32_kimpl);
MIDOUT_DECL(megdnn_fallback_conv_bias_f32_kimpl);
bool ConvBiasImpl::AlgoF32Direct::usable(
const NCBKernSizeParam& param, AlgoSelectionStrategy) const {
MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 0) {
MIDOUT_BEGIN(megdnn_fallback_conv_bias_f32_kimpl, 0, 0) {
auto&& fm = param.filter_meta;
auto FH = fm.spatial[0];
auto SH = fm.stride[0], SW = fm.stride[1];
......@@ -341,7 +340,7 @@ bool ConvBiasImpl::AlgoF32Direct::usable(
return false;
}
size_t ConvBiasImpl::AlgoF32Direct::get_workspace(const NCBKernSizeParam& param) const {
MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 1) {
MIDOUT_BEGIN(megdnn_fallback_conv_bias_f32_kimpl, 0, 1) {
bool large_group = param.filter_meta.group >= param.nr_threads;
auto wbundle = fallback::MultithreadDirectConvCommon<float, float>::get_bundle(
param, large_group);
......@@ -426,7 +425,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::dispatch_kerns(
const NCBKernSizeParam& param) const {
MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 1) {
MIDOUT_BEGIN(megdnn_fallback_conv_bias_f32_kimpl, 0, 1) {
return get_kimpls(param);
}
MIDOUT_END();
......@@ -435,7 +434,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::dispatch_kerns(
/* ===================== stride-1 algo ===================== */
bool ConvBiasImpl::AlgoF32DirectStride1::usable(
const NCBKernSizeParam& param, AlgoSelectionStrategy) const {
MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 1) {
MIDOUT_BEGIN(megdnn_fallback_conv_bias_f32_kimpl, 1, 1) {
auto&& fm = param.filter_meta;
auto FH = fm.spatial[0];
return param.filter_meta.format == param::ConvBias::Format::NCHW &&
......@@ -452,7 +451,7 @@ bool ConvBiasImpl::AlgoF32DirectStride1::usable(
size_t ConvBiasImpl::AlgoF32DirectStride1::get_workspace(
const NCBKernSizeParam& param) const {
MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 1) {
MIDOUT_BEGIN(megdnn_fallback_conv_bias_f32_kimpl, 1, 1) {
bool large_group = param.filter_meta.group >= param.nr_threads;
auto bundle =
fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
......@@ -548,7 +547,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::get_kimpl
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::dispatch_kerns(
const NCBKernSizeParam& param) const {
MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 2) {
MIDOUT_BEGIN(megdnn_fallback_conv_bias_f32_kimpl, 1, 2) {
return get_kimpls(param);
}
MIDOUT_END();
......@@ -559,7 +558,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::dispatch_
bool ConvBiasImpl::AlgoF32DirectStride2::usable(
const NCBKernSizeParam& param, AlgoSelectionStrategy) const {
MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 0) {
MIDOUT_BEGIN(megdnn_fallback_conv_bias_f32_kimpl, 2, 0) {
auto&& fm = param.filter_meta;
auto FH = fm.spatial[0];
return param.filter_meta.format == param::ConvBias::Format::NCHW &&
......@@ -575,7 +574,7 @@ bool ConvBiasImpl::AlgoF32DirectStride2::usable(
}
size_t ConvBiasImpl::AlgoF32DirectStride2::get_workspace(
const NCBKernSizeParam& param) const {
MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 1) {
MIDOUT_BEGIN(megdnn_fallback_conv_bias_f32_kimpl, 2, 1) {
bool large_group = param.filter_meta.group >= param.nr_threads;
auto bundle =
fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
......@@ -670,7 +669,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::dispatch_kerns(
const NCBKernSizeParam& param) const {
MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 2) {
MIDOUT_BEGIN(megdnn_fallback_conv_bias_f32_kimpl, 2, 2) {
return get_kimpls(param);
}
MIDOUT_END();
......
/**
* \file dnn/src/arm_common/conv_bias/fp32/algos.h
* \file dnn/src/fallback/conv_bias/gi/fp32/algos.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -12,11 +12,11 @@
#pragma once
#include "src/arm_common/conv_bias/opr_impl.h"
#include "src/fallback/conv_bias/opr_impl.h"
#include "src/fallback/matrix_mul/opr_impl.h"
namespace megdnn {
namespace arm_common {
namespace fallback {
class ConvBiasImpl::AlgoFP32WinogradF23_4x4 final : public AlgoBase {
public:
AlgoFP32WinogradF23_4x4(
......@@ -31,7 +31,7 @@ public:
}
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_WINOGRAD_F23_4X4_FP32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F23_4X4_FP32)
};
class ConvBiasImpl::AlgoFP32WinogradF63 final : public AlgoBase {
......@@ -50,7 +50,7 @@ public:
return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE;
}
MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_WINOGRAD_F63_FP32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F63_FP32)
};
class ConvBiasImpl::AlgoFP32WinogradF63_4x4 final : public AlgoBase {
......@@ -67,7 +67,7 @@ public:
}
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_WINOGRAD_F63_4X4_FP32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F63_4X4_FP32)
};
class ConvBiasImpl::AlgoFP32WinogradF54 final : public AlgoBase {
......@@ -86,7 +86,7 @@ public:
return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE;
}
MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_WINOGRAD_F54_FP32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F54_FP32)
};
class ConvBiasImpl::AlgoFP32WinogradF45 final : public AlgoBase {
......@@ -105,7 +105,7 @@ public:
return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE;
}
MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_WINOGRAD_F45_FP32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F45_FP32)
};
//===================== NCHW44 Winograd Support =====================//
......@@ -124,7 +124,7 @@ public:
}
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_WINOGRAD_F23_4X4_NCHW44_F32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F23_4X4_NCHW44_F32)
};
class ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44 final : public AlgoBase {
......@@ -142,7 +142,7 @@ public:
}
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_WINOGRAD_F63_4X4_NCHW44_F32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F63_4X4_NCHW44_F32)
};
class ConvBiasImpl::AlgoFP32WinogradF73_4x4_NCHW44 final : public AlgoBase {
......@@ -160,7 +160,7 @@ public:
}
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_WINOGRAD_F73_4X4_NCHW44_F32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F73_4X4_NCHW44_F32)
};
// ================================================================= //
......@@ -180,7 +180,7 @@ public:
ConvAlgoTypePack get_algo_type() const override {
return {AlgoDataType::FLOAT32, AlgoCategory::DIRECT};
}
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_DIRECT_FP32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_DIRECT_FP32)
};
class ConvBiasImpl::AlgoF32DirectStride1 final : public AlgoBase {
......@@ -199,7 +199,7 @@ public:
ConvAlgoTypePack get_algo_type() const override {
return {AlgoDataType::FLOAT32, AlgoCategory::DIRECT};
}
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_DIRECT_STRD1_FP32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_DIRECT_STRD1_FP32)
};
class ConvBiasImpl::AlgoF32DirectStride2 final : public AlgoBase {
......@@ -218,7 +218,7 @@ public:
ConvAlgoTypePack get_algo_type() const override {
return {AlgoDataType::FLOAT32, AlgoCategory::DIRECT};
}
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_DIRECT_STRD2_FP32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_DIRECT_STRD2_FP32)
};
class ConvBiasImpl::AlgoF32DirectNCHW44 final : public AlgoBase {
......@@ -238,7 +238,7 @@ public:
ConvAlgoTypePack get_algo_type() const override {
return {AlgoDataType::FLOAT32, AlgoCategory::DIRECT};
}
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_DIRECT_NCHW44_FP32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_DIRECT_NCHW44_FP32)
};
class ConvBiasImpl::AlgoF32DirectNCHWNCHW44 final : public AlgoBase {
......@@ -258,7 +258,7 @@ public:
ConvAlgoTypePack get_algo_type() const override {
return {AlgoDataType::FLOAT32, AlgoCategory::DIRECT};
}
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_DIRECT_NCHW_NCHW44_FP32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_DIRECT_NCHW_NCHW44_FP32)
};
class ConvBiasImpl::AlgoF32ChannelWiseNCHW44 final : public AlgoBase {
......@@ -277,10 +277,10 @@ public:
ConvAlgoTypePack get_algo_type() const override {
return {AlgoDataType::FLOAT32, AlgoCategory::DIRECT};
}
MEGDNN_DECL_ALGO_TYPE(ARM_COMMON_CHWNWISE_NCHW44_F32)
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_CHWNWISE_NCHW44_F32)
};
} // namespace arm_common
} // namespace fallback
} // namespace megdnn
#undef MEGDNN_WINOGRAD_ALGO_FUN_DECLARE
......
/**
* \file dnn/src/arm_common/conv_bias/fp32/channel_wise_3x3_s1p1_nchw44_kern.cpp
* \file dnn/src/fallback/conv_bias/gi/fp32/channel_wise_3x3_s1p1_nchw44_kern.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,29 +10,22 @@
* implied.
*/
#include "src/arm_common/conv_bias/fp32/channel_wise_3x3_s1p1_nchw44_kern.h"
#include "src/arm_common/elemwise_helper/elemwise_op.h"
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/arm_common/utils.h"
#include "src/fallback/conv_bias/gi/fp32/channel_wise_3x3_s1p1_nchw44_kern.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/gi/utils.h"
#include "src/fallback/elemwise_helper/elemwise_op.h"
#pragma GCC diagnostic ignored "-Wunused-parameter"
using namespace megdnn;
using namespace arm_common;
using namespace fallback;
namespace {
#if defined(__ARM_FEATURE_FMA)
#define Vfmaq_f32(d, n, m) vfmaq_f32(d, n, m)
#else
#define Vfmaq_f32(d, n, m) vmlaq_f32(d, n, m)
#endif
template <int shift>
static inline void shift_src(float32x4_t rsrc[3][4]) {
float32x4_t t[4];
static inline void shift_src(GI_FLOAT32_t rsrc[3][4]) {
GI_FLOAT32_t t[4];
t[0] = rsrc[0][(shift + 0) % 4];
t[1] = rsrc[0][(shift + 1) % 4];
......@@ -63,9 +56,9 @@ static inline void shift_src(float32x4_t rsrc[3][4]) {
}
template <BiasMode bias_mode>
static inline float32x4_t load_bias(const float* bias, const float32x4_t& init) {
static inline GI_FLOAT32_t load_bias(const float* bias, const GI_FLOAT32_t& init) {
if (bias_mode == BiasMode::BIAS) {
return vld1q_f32(bias);
return GiLoadFloat32(bias);
} else {
return init;
}
......@@ -76,35 +69,35 @@ struct compute_element {
template <typename Op>
static inline void call(
const float*& src0, const float*& src1, const float*& src2, float*& dst,
const float*& bias, const float32x4_t& init, float32x4_t rsrc[3][4],
float32x4_t rfilter[3][3], const Op& op) {
const float*& bias, const GI_FLOAT32_t& init, GI_FLOAT32_t rsrc[3][4],
GI_FLOAT32_t rfilter[3][3], const Op& op) {
#define RSRC(i, j) rsrc[i][((j) + bw) % 4]
float32x4_t rdst = load_bias<bias_mode>(bias, init);
GI_FLOAT32_t rdst = load_bias<bias_mode>(bias, init);
if (has_top) {
RSRC(0, 3) = vld1q_f32(src0 + 8);
RSRC(0, 3) = GiLoadFloat32(src0 + 8);
}
{ RSRC(1, 3) = vld1q_f32(src1 + 8); }
{ RSRC(1, 3) = GiLoadFloat32(src1 + 8); }
if (has_bottom) {
RSRC(2, 3) = vld1q_f32(src2 + 8);
RSRC(2, 3) = GiLoadFloat32(src2 + 8);
}
if (has_top) {
rdst = Vfmaq_f32(rdst, RSRC(0, 0), rfilter[0][0]);
rdst = Vfmaq_f32(rdst, RSRC(0, 1), rfilter[0][1]);
rdst = Vfmaq_f32(rdst, RSRC(0, 2), rfilter[0][2]);
rdst = GiMlaqFloat32(rdst, RSRC(0, 0), rfilter[0][0]);
rdst = GiMlaqFloat32(rdst, RSRC(0, 1), rfilter[0][1]);
rdst = GiMlaqFloat32(rdst, RSRC(0, 2), rfilter[0][2]);
}
{
rdst = Vfmaq_f32(rdst, RSRC(1, 0), rfilter[1][0]);
rdst = Vfmaq_f32(rdst, RSRC(1, 1), rfilter[1][1]);
rdst = Vfmaq_f32(rdst, RSRC(1, 2), rfilter[1][2]);
rdst = GiMlaqFloat32(rdst, RSRC(1, 0), rfilter[1][0]);
rdst = GiMlaqFloat32(rdst, RSRC(1, 1), rfilter[1][1]);
rdst = GiMlaqFloat32(rdst, RSRC(1, 2), rfilter[1][2]);
}
if (has_bottom) {
rdst = Vfmaq_f32(rdst, RSRC(2, 0), rfilter[2][0]);
rdst = Vfmaq_f32(rdst, RSRC(2, 1), rfilter[2][1]);
rdst = Vfmaq_f32(rdst, RSRC(2, 2), rfilter[2][2]);
rdst = GiMlaqFloat32(rdst, RSRC(2, 0), rfilter[2][0]);
rdst = GiMlaqFloat32(rdst, RSRC(2, 1), rfilter[2][1]);
rdst = GiMlaqFloat32(rdst, RSRC(2, 2), rfilter[2][2]);
}
vst1q_f32(dst, op(rdst));
GiStoreFloat32(dst, op(rdst));
if (has_top) {
src0 += 4;
......@@ -131,27 +124,27 @@ template <bool has_top, bool has_bottom, BiasMode bias_mode>
struct compute_element_right {
template <typename Op>
static inline void call(
float*& dst, const float*& bias, const float32x4_t& init,
float32x4_t rsrc[3][4], float32x4_t rfilter[3][3], const Op& op) {
float32x4_t rdst = load_bias<bias_mode>(bias, init);
float*& dst, const float*& bias, const GI_FLOAT32_t& init,
GI_FLOAT32_t rsrc[3][4], GI_FLOAT32_t rfilter[3][3], const Op& op) {
GI_FLOAT32_t rdst = load_bias<bias_mode>(bias, init);
if (has_top) {
rdst = Vfmaq_f32(rdst, rsrc[0][0], rfilter[0][0]);
rdst = Vfmaq_f32(rdst, rsrc[0][1], rfilter[0][1]);
rdst = Vfmaq_f32(rdst, rsrc[0][2], rfilter[0][2]);
rdst = GiMlaqFloat32(rdst, rsrc[0][0], rfilter[0][0]);
rdst = GiMlaqFloat32(rdst, rsrc[0][1], rfilter[0][1]);
rdst = GiMlaqFloat32(rdst, rsrc[0][2], rfilter[0][2]);
}
{
rdst = Vfmaq_f32(rdst, rsrc[1][0], rfilter[1][0]);
rdst = Vfmaq_f32(rdst, rsrc[1][1], rfilter[1][1]);
rdst = Vfmaq_f32(rdst, rsrc[1][2], rfilter[1][2]);
rdst = GiMlaqFloat32(rdst, rsrc[1][0], rfilter[1][0]);
rdst = GiMlaqFloat32(rdst, rsrc[1][1], rfilter[1][1]);
rdst = GiMlaqFloat32(rdst, rsrc[1][2], rfilter[1][2]);
}
if (has_bottom) {
rdst = Vfmaq_f32(rdst, rsrc[2][0], rfilter[2][0]);
rdst = Vfmaq_f32(rdst, rsrc[2][1], rfilter[2][1]);
rdst = Vfmaq_f32(rdst, rsrc[2][2], rfilter[2][2]);
rdst = GiMlaqFloat32(rdst, rsrc[2][0], rfilter[2][0]);
rdst = GiMlaqFloat32(rdst, rsrc[2][1], rfilter[2][1]);
rdst = GiMlaqFloat32(rdst, rsrc[2][2], rfilter[2][2]);
}
vst1q_f32(dst, op(rdst));
GiStoreFloat32(dst, op(rdst));
dst += 4;
bias += 4;
......@@ -162,24 +155,24 @@ template <bool has_top, bool has_bottom, BiasMode bias_mode>
struct compute_element_right_pad {
template <typename Op>
static inline void call(
float*& dst, const float*& bias, const float32x4_t& init,
float32x4_t rsrc[3][4], float32x4_t rfilter[3][3], const Op& op) {
float32x4_t rdst = load_bias<bias_mode>(bias, init);
float*& dst, const float*& bias, const GI_FLOAT32_t& init,
GI_FLOAT32_t rsrc[3][4], GI_FLOAT32_t rfilter[3][3], const Op& op) {
GI_FLOAT32_t rdst = load_bias<bias_mode>(bias, init);
if (has_top) {
rdst = Vfmaq_f32(rdst, rsrc[0][1], rfilter[0][0]);
rdst = Vfmaq_f32(rdst, rsrc[0][2], rfilter[0][1]);
rdst = GiMlaqFloat32(rdst, rsrc[0][1], rfilter[0][0]);
rdst = GiMlaqFloat32(rdst, rsrc[0][2], rfilter[0][1]);
}
{
rdst = Vfmaq_f32(rdst, rsrc[1][1], rfilter[1][0]);
rdst = Vfmaq_f32(rdst, rsrc[1][2], rfilter[1][1]);
rdst = GiMlaqFloat32(rdst, rsrc[1][1], rfilter[1][0]);
rdst = GiMlaqFloat32(rdst, rsrc[1][2], rfilter[1][1]);
}
if (has_bottom) {
rdst = Vfmaq_f32(rdst, rsrc[2][1], rfilter[2][0]);
rdst = Vfmaq_f32(rdst, rsrc[2][2], rfilter[2][1]);
rdst = GiMlaqFloat32(rdst, rsrc[2][1], rfilter[2][0]);
rdst = GiMlaqFloat32(rdst, rsrc[2][2], rfilter[2][1]);
}
vst1q_f32(dst, op(rdst));
GiStoreFloat32(dst, op(rdst));
dst += 4;
bias += 4;
}
......@@ -190,22 +183,22 @@ struct compute_row {
template <typename Op>
static inline void call(
const float*& src0, const float*& src1, const float*& src2, float*& dst,
const float*& bias, const float32x4_t& init, float32x4_t rsrc[3][4],
float32x4_t rfilter[3][3], int W, const Op& op) {
const float*& bias, const GI_FLOAT32_t& init, GI_FLOAT32_t rsrc[3][4],
GI_FLOAT32_t rfilter[3][3], int W, const Op& op) {
if (has_top) {
rsrc[0][0] = vdupq_n_f32(0);
rsrc[0][1] = vld1q_f32(src0 + 0);
rsrc[0][2] = vld1q_f32(src0 + 4);
rsrc[0][0] = GiZeroFloat32();
rsrc[0][1] = GiLoadFloat32(src0 + 0);
rsrc[0][2] = GiLoadFloat32(src0 + 4);
}
{
rsrc[1][0] = vdupq_n_f32(0);
rsrc[1][1] = vld1q_f32(src1 + 0);
rsrc[1][2] = vld1q_f32(src1 + 4);
rsrc[1][0] = GiZeroFloat32();
rsrc[1][1] = GiLoadFloat32(src1 + 0);
rsrc[1][2] = GiLoadFloat32(src1 + 4);
}
if (has_bottom) {
rsrc[2][0] = vdupq_n_f32(0);
rsrc[2][1] = vld1q_f32(src2 + 0);
rsrc[2][2] = vld1q_f32(src2 + 4);
rsrc[2][0] = GiZeroFloat32();
rsrc[2][1] = GiLoadFloat32(src2 + 0);
rsrc[2][2] = GiLoadFloat32(src2 + 4);
}
int w = 0;
......@@ -256,27 +249,27 @@ void channel_wise_nchw44_float::do_conv_kern_3x3_stride1_padding1(
int W) {
Op op;
float32x4_t init = vdupq_n_f32(0);
GI_FLOAT32_t init = GiZeroFloat32();
if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
init = vld1q_f32(bias);
init = GiLoadFloat32(bias);
}
const float* src0 = src - W * 4;
const float* src1 = src;
const float* src2 = src + W * 4;
float32x4_t rfilter[3][3];
rfilter[0][0] = vld1q_f32(filter + 0);
rfilter[0][1] = vld1q_f32(filter + 4);
rfilter[0][2] = vld1q_f32(filter + 8);
rfilter[1][0] = vld1q_f32(filter + 12);
rfilter[1][1] = vld1q_f32(filter + 16);
rfilter[1][2] = vld1q_f32(filter + 20);
rfilter[2][0] = vld1q_f32(filter + 24);
rfilter[2][1] = vld1q_f32(filter + 28);
rfilter[2][2] = vld1q_f32(filter + 32);
float32x4_t rsrc[3][4];
GI_FLOAT32_t rfilter[3][3];
rfilter[0][0] = GiLoadFloat32(filter + 0);
rfilter[0][1] = GiLoadFloat32(filter + 4);
rfilter[0][2] = GiLoadFloat32(filter + 8);
rfilter[1][0] = GiLoadFloat32(filter + 12);
rfilter[1][1] = GiLoadFloat32(filter + 16);
rfilter[1][2] = GiLoadFloat32(filter + 20);
rfilter[2][0] = GiLoadFloat32(filter + 24);
rfilter[2][1] = GiLoadFloat32(filter + 28);
rfilter[2][2] = GiLoadFloat32(filter + 32);
GI_FLOAT32_t rsrc[3][4];
compute_row<false, true, bias_mode>::call(
src0, src1, src2, dst, bias, init, rsrc, rfilter, W, op);
......
/**
* \file dnn/src/arm_common/conv_bias/fp32/channel_wise_3x3_s1p1_nchw44_kern.h
* \file dnn/src/fallback/conv_bias/gi/fp32/channel_wise_3x3_s1p1_nchw44_kern.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -12,11 +12,11 @@
#pragma once
#include "src/arm_common/conv_bias/opr_impl.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/opr_impl.h"
namespace megdnn {
namespace arm_common {
namespace fallback {
namespace channel_wise_nchw44_float {
template <BiasMode bias_mode, typename Op>
......@@ -25,7 +25,7 @@ void do_conv_kern_3x3_stride1_padding1(
int W);
} // namespace channel_wise_nchw44_float
} // namespace arm_common
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/arm_common/conv_bias/fp32/channel_wise_5x5_s1p2_nchw44_kern.cpp
* \file dnn/src/fallback/conv_bias/gi/fp32/channel_wise_5x5_s1p2_nchw44_kern.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,29 +10,22 @@
* implied.
*/
#include "src/arm_common/conv_bias/fp32/channel_wise_5x5_s1p2_nchw44_kern.h"
#include "src/arm_common/elemwise_helper/elemwise_op.h"
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/arm_common/utils.h"
#include "src/fallback/conv_bias/gi/fp32/channel_wise_5x5_s1p2_nchw44_kern.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/gi/utils.h"
#include "src/fallback/elemwise_helper/elemwise_op.h"
#pragma GCC diagnostic ignored "-Wunused-parameter"
using namespace megdnn;
using namespace arm_common;
using namespace fallback;
namespace {
#if defined(__ARM_FEATURE_FMA)
#define Vfmaq_f32(d, n, m) vfmaq_f32(d, n, m)
#else
#define Vfmaq_f32(d, n, m) vmlaq_f32(d, n, m)
#endif
template <int shift>
static inline void shift_src(float32x4_t rsrc[6]) {
float32x4_t t[6];
static inline void shift_src(GI_FLOAT32_t rsrc[6]) {
GI_FLOAT32_t t[6];
t[0] = rsrc[(shift + 0) % 6];
t[1] = rsrc[(shift + 1) % 6];
......@@ -48,18 +41,18 @@ static inline void shift_src(float32x4_t rsrc[6]) {
rsrc[5] = t[5];
}
static inline void load_filter(const float* filter, float32x4_t rfilter[5]) {
rfilter[0] = vld1q_f32(filter + 0);
rfilter[1] = vld1q_f32(filter + 4);
rfilter[2] = vld1q_f32(filter + 8);
rfilter[3] = vld1q_f32(filter + 12);
rfilter[4] = vld1q_f32(filter + 16);
static inline void load_filter(const float* filter, GI_FLOAT32_t rfilter[5]) {
rfilter[0] = GiLoadFloat32(filter + 0);
rfilter[1] = GiLoadFloat32(filter + 4);
rfilter[2] = GiLoadFloat32(filter + 8);
rfilter[3] = GiLoadFloat32(filter + 12);
rfilter[4] = GiLoadFloat32(filter + 16);
}
template <BiasMode bias_mode>
static inline float32x4_t load_bias(const float* bias, const float32x4_t& init) {
static inline GI_FLOAT32_t load_bias(const float* bias, const GI_FLOAT32_t& init) {
if (bias_mode == BiasMode::BIAS) {
return vld1q_f32(bias);
return GiLoadFloat32(bias);
} else {
return init;
}
......@@ -69,27 +62,28 @@ template <int BW, int bw, BiasMode bias_mode, bool need_load_bias, bool need_do_
struct compute_element {
template <typename Op>
static inline void call(
const float*& src, float*& dst, const float*& bias, const float32x4_t& init,
float32x4_t rsrc[6], float32x4_t rfilter[5], const Op& op) {
const float*& src, float*& dst, const float*& bias,
const GI_FLOAT32_t& init, GI_FLOAT32_t rsrc[6], GI_FLOAT32_t rfilter[5],
const Op& op) {
#define RSRC(i) rsrc[((i) + bw) % 6]
float32x4_t rdst;
GI_FLOAT32_t rdst;
if (need_load_bias) {
rdst = load_bias<bias_mode>(bias, init);
} else {
rdst = vld1q_f32(dst);
rdst = GiLoadFloat32(dst);
}
RSRC(5) = vld1q_f32(src + 12);
RSRC(5) = GiLoadFloat32(src + 12);
rdst = Vfmaq_f32(rdst, RSRC(0), rfilter[0]);
rdst = Vfmaq_f32(rdst, RSRC(1), rfilter[1]);
rdst = Vfmaq_f32(rdst, RSRC(2), rfilter[2]);
rdst = Vfmaq_f32(rdst, RSRC(3), rfilter[3]);
rdst = Vfmaq_f32(rdst, RSRC(4), rfilter[4]);
rdst = GiMlaqFloat32(rdst, RSRC(0), rfilter[0]);
rdst = GiMlaqFloat32(rdst, RSRC(1), rfilter[1]);
rdst = GiMlaqFloat32(rdst, RSRC(2), rfilter[2]);
rdst = GiMlaqFloat32(rdst, RSRC(3), rfilter[3]);
rdst = GiMlaqFloat32(rdst, RSRC(4), rfilter[4]);
if (need_do_op) {
rdst = op(rdst);
}
vst1q_f32(dst, rdst);
GiStoreFloat32(dst, rdst);
src += 4;
dst += 4;
......@@ -110,29 +104,29 @@ template <size_t padding, BiasMode bias_mode, bool need_load_bias, bool need_do_
struct compute_element_right {
template <typename Op>
static inline void call(
float*& dst, const float*& bias, const float32x4_t& init,
float32x4_t rsrc[6], float32x4_t rfilter[5], const Op& op) {
float32x4_t rdst;
float*& dst, const float*& bias, const GI_FLOAT32_t& init,
GI_FLOAT32_t rsrc[6], GI_FLOAT32_t rfilter[5], const Op& op) {
GI_FLOAT32_t rdst;
if (need_load_bias) {
rdst = load_bias<bias_mode>(bias, init);
} else {
rdst = vld1q_f32(dst);
rdst = GiLoadFloat32(dst);
}
rdst = Vfmaq_f32(rdst, rsrc[0 + padding], rfilter[0]);
rdst = Vfmaq_f32(rdst, rsrc[1 + padding], rfilter[1]);
rdst = Vfmaq_f32(rdst, rsrc[2 + padding], rfilter[2]);
rdst = GiMlaqFloat32(rdst, rsrc[0 + padding], rfilter[0]);
rdst = GiMlaqFloat32(rdst, rsrc[1 + padding], rfilter[1]);
rdst = GiMlaqFloat32(rdst, rsrc[2 + padding], rfilter[2]);
if (padding < 2) {
rdst = Vfmaq_f32(rdst, rsrc[3 + padding], rfilter[3]);
rdst = GiMlaqFloat32(rdst, rsrc[3 + padding], rfilter[3]);
}
if (padding < 1) {
rdst = Vfmaq_f32(rdst, rsrc[4 + padding], rfilter[4]);
rdst = GiMlaqFloat32(rdst, rsrc[4 + padding], rfilter[4]);
}
if (need_do_op) {
rdst = op(rdst);
}
vst1q_f32(dst, rdst);
GiStoreFloat32(dst, rdst);
dst += 4;
bias += 4;
......@@ -143,13 +137,13 @@ template <BiasMode bias_mode, bool need_load_bias, bool need_do_op>
struct compute_row_src_1x5 {
template <typename Op>
static inline void call(
const float* src, float* dst, const float* bias, const float32x4_t& init,
float32x4_t rsrc[6], float32x4_t rfilter[5], int W, const Op& op) {
rsrc[0] = vdupq_n_f32(0);
rsrc[1] = vdupq_n_f32(0);
rsrc[2] = vld1q_f32(src + 0);
rsrc[3] = vld1q_f32(src + 4);
rsrc[4] = vld1q_f32(src + 8);
const float* src, float* dst, const float* bias, const GI_FLOAT32_t& init,
GI_FLOAT32_t rsrc[6], GI_FLOAT32_t rfilter[5], int W, const Op& op) {
rsrc[0] = GiZeroFloat32();
rsrc[1] = GiZeroFloat32();
rsrc[2] = GiLoadFloat32(src + 0);
rsrc[3] = GiLoadFloat32(src + 4);
rsrc[4] = GiLoadFloat32(src + 8);
int w = 0;
......@@ -190,8 +184,8 @@ struct compute_row {
template <typename Op>
static inline void call(
const float*& src, float*& dst, const float* filter, const float*& bias,
const float32x4_t& init, float32x4_t rsrc[6], float32x4_t rfilter[5], int W,
const Op& op) {
const GI_FLOAT32_t& init, GI_FLOAT32_t rsrc[6], GI_FLOAT32_t rfilter[5],
int W, const Op& op) {
if (top_padding < 1) {
load_filter(filter + 0, rfilter);
compute_row_src_1x5<bias_mode, top_padding == 0, false>::call(
......@@ -235,13 +229,13 @@ void channel_wise_nchw44_float::do_conv_kern_5x5_stride1_padding2(
int W) {
Op op;
float32x4_t init = vdupq_n_f32(0);
GI_FLOAT32_t init = GiZeroFloat32();
if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
init = vld1q_f32(bias);
init = GiLoadFloat32(bias);
}
float32x4_t rsrc[6];
float32x4_t rfilter[5];
GI_FLOAT32_t rsrc[6];
GI_FLOAT32_t rfilter[5];
compute_row<2, 0, bias_mode>::call(
src, dst, filter, bias, init, rsrc, rfilter, W, op);
......
/**
* \file dnn/src/arm_common/conv_bias/fp32/channel_wise_5x5_s1p2_nchw44_kern.h
* \file dnn/src/fallback/conv_bias/gi/fp32/channel_wise_5x5_s1p2_nchw44_kern.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -12,11 +12,11 @@
#pragma once
#include "src/arm_common/conv_bias/opr_impl.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/opr_impl.h"
namespace megdnn {
namespace arm_common {
namespace fallback {
namespace channel_wise_nchw44_float {
template <BiasMode bias_mode, typename Op>
......@@ -25,7 +25,7 @@ void do_conv_kern_5x5_stride1_padding2(
int W);
} // namespace channel_wise_nchw44_float
} // namespace arm_common
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/arm_common/conv_bias/fp32/channel_wise_nchw44_algo.cpp
* \file dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_algo.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,14 +10,14 @@
* implied.
*/
#include "src/arm_common/conv_bias/fp32/algos.h"
#include "src/arm_common/conv_bias/fp32/channel_wise_nchw44_kern.h"
#include "src/arm_common/elemwise_helper/elemwise_op.h"
#include "src/fallback/conv_bias/gi/fp32/algos.h"
#include "src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.h"
#include "src/fallback/elemwise_helper/elemwise_op.h"
#include "midout.h"
using namespace megdnn;
using namespace arm_common;
using namespace fallback;
using conv_fun = std::function<void(
const float* src, const float* filter, const float* bias, float* dst,
const size_t IH, const size_t IW, const size_t OH, const size_t OW,
......
/**
* \file dnn/src/arm_common/conv_bias/fp32/channel_wise_nchw44_kern.h
* \file dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -12,11 +12,11 @@
#pragma once
#include "src/arm_common/conv_bias/opr_impl.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/opr_impl.h"
namespace megdnn {
namespace arm_common {
namespace fallback {
namespace channel_wise_nchw44_float {
#define KERN(stride, i) \
......@@ -37,7 +37,7 @@ KERN(stride2, 5)
#undef KERN
} // namespace channel_wise_nchw44_float
} // namespace arm_common
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/arm_common/conv_bias/fp32/direct.h
* \file dnn/src/fallback/conv_bias/gi/fp32/direct.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -13,7 +13,7 @@
#include <cstddef>
namespace megdnn {
namespace arm_common {
namespace fallback {
namespace fp32 {
namespace conv_bias {
......@@ -23,7 +23,7 @@ void kern_direct(
} // namespace conv_bias
} // namespace fp32
} // namespace arm_common
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -11,12 +11,12 @@
* implied.
*/
#include "src/arm_common/conv_bias/fp32/f32_direct_nchw44_kern.h"
#include "src/arm_common/conv_bias/opr_impl.h"
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/fallback/conv_bias/gi/fp32/f32_direct_nchw44_kern.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/opr_impl.h"
#include "src/fallback/general_intrinsic/gi_float.h"
namespace megdnn {
namespace arm_common {
namespace fallback {
namespace conv_bias {
template <>
void pack_src_fp32_nchw44<1>(
......@@ -51,23 +51,23 @@ static inline void odd_even_split_iw8_even(
const int src_offset = src_idx * ic_step;
const int even_offset = iw_idx / 2 * ic_step;
const int odd_offset = (odd_start + iw_idx / 2) * ic_step;
float32x4_t temp[8];
temp[0] = vld1q_f32(sptr + src_offset + 0 * ic_step);
temp[1] = vld1q_f32(sptr + src_offset + 1 * ic_step);
temp[2] = vld1q_f32(sptr + src_offset + 2 * ic_step);
temp[3] = vld1q_f32(sptr + src_offset + 3 * ic_step);
temp[4] = vld1q_f32(sptr + src_offset + 4 * ic_step);
temp[5] = vld1q_f32(sptr + src_offset + 5 * ic_step);
temp[6] = vld1q_f32(sptr + src_offset + 6 * ic_step);
temp[7] = vld1q_f32(sptr + src_offset + 7 * ic_step);
vst1q_f32(sptr_base + even_offset + 0 * ic_step, temp[0]);
vst1q_f32(sptr_base + even_offset + 1 * ic_step, temp[2]);
vst1q_f32(sptr_base + even_offset + 2 * ic_step, temp[4]);
vst1q_f32(sptr_base + even_offset + 3 * ic_step, temp[6]);
vst1q_f32(sptr_base + odd_offset + 0 * ic_step, temp[1]);
vst1q_f32(sptr_base + odd_offset + 1 * ic_step, temp[3]);
vst1q_f32(sptr_base + odd_offset + 2 * ic_step, temp[5]);
vst1q_f32(sptr_base + odd_offset + 3 * ic_step, temp[7]);
GI_FLOAT32_t temp[8];
temp[0] = GiLoadFloat32(sptr + src_offset + 0 * ic_step);
temp[1] = GiLoadFloat32(sptr + src_offset + 1 * ic_step);
temp[2] = GiLoadFloat32(sptr + src_offset + 2 * ic_step);
temp[3] = GiLoadFloat32(sptr + src_offset + 3 * ic_step);
temp[4] = GiLoadFloat32(sptr + src_offset + 4 * ic_step);
temp[5] = GiLoadFloat32(sptr + src_offset + 5 * ic_step);
temp[6] = GiLoadFloat32(sptr + src_offset + 6 * ic_step);
temp[7] = GiLoadFloat32(sptr + src_offset + 7 * ic_step);
GiStoreFloat32(sptr_base + even_offset + 0 * ic_step, temp[0]);
GiStoreFloat32(sptr_base + even_offset + 1 * ic_step, temp[2]);
GiStoreFloat32(sptr_base + even_offset + 2 * ic_step, temp[4]);
GiStoreFloat32(sptr_base + even_offset + 3 * ic_step, temp[6]);
GiStoreFloat32(sptr_base + odd_offset + 0 * ic_step, temp[1]);
GiStoreFloat32(sptr_base + odd_offset + 1 * ic_step, temp[3]);
GiStoreFloat32(sptr_base + odd_offset + 2 * ic_step, temp[5]);
GiStoreFloat32(sptr_base + odd_offset + 3 * ic_step, temp[7]);
}
static inline void odd_even_split_iw8_odd(
......@@ -77,23 +77,23 @@ static inline void odd_even_split_iw8_odd(
const int src_offset = src_idx * ic_step;
const int even_offset = (iw_idx + 1) / 2 * ic_step;
const int odd_offset = (odd_start + iw_idx / 2) * ic_step;
float32x4_t temp[8];
temp[0] = vld1q_f32(sptr + src_offset + 0 * ic_step);
temp[1] = vld1q_f32(sptr + src_offset + 1 * ic_step);
temp[2] = vld1q_f32(sptr + src_offset + 2 * ic_step);
temp[3] = vld1q_f32(sptr + src_offset + 3 * ic_step);
temp[4] = vld1q_f32(sptr + src_offset + 4 * ic_step);
temp[5] = vld1q_f32(sptr + src_offset + 5 * ic_step);
temp[6] = vld1q_f32(sptr + src_offset + 6 * ic_step);
temp[7] = vld1q_f32(sptr + src_offset + 7 * ic_step);
vst1q_f32(sptr_base + odd_offset + 0 * ic_step, temp[0]);
vst1q_f32(sptr_base + odd_offset + 1 * ic_step, temp[2]);
vst1q_f32(sptr_base + odd_offset + 2 * ic_step, temp[4]);
vst1q_f32(sptr_base + odd_offset + 3 * ic_step, temp[6]);
vst1q_f32(sptr_base + even_offset + 0 * ic_step, temp[1]);
vst1q_f32(sptr_base + even_offset + 1 * ic_step, temp[3]);
vst1q_f32(sptr_base + even_offset + 2 * ic_step, temp[5]);
vst1q_f32(sptr_base + even_offset + 3 * ic_step, temp[7]);
GI_FLOAT32_t temp[8];
temp[0] = GiLoadFloat32(sptr + src_offset + 0 * ic_step);
temp[1] = GiLoadFloat32(sptr + src_offset + 1 * ic_step);
temp[2] = GiLoadFloat32(sptr + src_offset + 2 * ic_step);
temp[3] = GiLoadFloat32(sptr + src_offset + 3 * ic_step);
temp[4] = GiLoadFloat32(sptr + src_offset + 4 * ic_step);
temp[5] = GiLoadFloat32(sptr + src_offset + 5 * ic_step);
temp[6] = GiLoadFloat32(sptr + src_offset + 6 * ic_step);
temp[7] = GiLoadFloat32(sptr + src_offset + 7 * ic_step);
GiStoreFloat32(sptr_base + odd_offset + 0 * ic_step, temp[0]);
GiStoreFloat32(sptr_base + odd_offset + 1 * ic_step, temp[2]);
GiStoreFloat32(sptr_base + odd_offset + 2 * ic_step, temp[4]);
GiStoreFloat32(sptr_base + odd_offset + 3 * ic_step, temp[6]);
GiStoreFloat32(sptr_base + even_offset + 0 * ic_step, temp[1]);
GiStoreFloat32(sptr_base + even_offset + 1 * ic_step, temp[3]);
GiStoreFloat32(sptr_base + even_offset + 2 * ic_step, temp[5]);
GiStoreFloat32(sptr_base + even_offset + 3 * ic_step, temp[7]);
}
} // namespace
......@@ -104,7 +104,7 @@ void pack_src_fp32_nchw44<2>(
const int pad_top, const int pad_bottom, const int ic, const int ic_stride) {
constexpr int ic_step = 4;
int odd_start = megdnn::div_ceil(iw2, 2);
float32x4_t zero_v = vdupq_n_f32(0.f);
GI_FLOAT32_t zero_v = GiZeroFloat32();
MEGDNN_MARK_USED_VAR(ph);
bool even_start = pw % 2 == 0;
rep_step(ic_idx, ic, ic_step) {
......@@ -115,9 +115,10 @@ void pack_src_fp32_nchw44<2>(
int iw_idx = 0;
rep(idx, pw) {
if (iw_idx % 2 == 0) {
vst1q_f32(sptr_base + iw_idx / 2 * ic_step, zero_v);
GiStoreFloat32(sptr_base + iw_idx / 2 * ic_step, zero_v);
} else {
vst1q_f32(sptr_base + (odd_start + iw_idx / 2) * ic_step, zero_v);
GiStoreFloat32(
sptr_base + (odd_start + iw_idx / 2) * ic_step, zero_v);
}
++iw_idx;
}
......@@ -136,21 +137,22 @@ void pack_src_fp32_nchw44<2>(
}
for (; src_idx < iw; ++src_idx) {
if (iw_idx % 2 == 0) {
vst1q_f32(
GiStoreFloat32(
sptr_base + iw_idx / 2 * ic_step,
vld1q_f32(sptr + src_idx * ic_step));
GiLoadFloat32(sptr + src_idx * ic_step));
} else {
vst1q_f32(
GiStoreFloat32(
sptr_base + (odd_start + iw_idx / 2) * ic_step,
vld1q_f32(sptr + src_idx * ic_step));
GiLoadFloat32(sptr + src_idx * ic_step));
}
++iw_idx;
}
rep(idx, pad_right) {
if (iw_idx % 2 == 0) {
vst1q_f32(sptr_base + iw_idx / 2 * ic_step, zero_v);
GiStoreFloat32(sptr_base + iw_idx / 2 * ic_step, zero_v);
} else {
vst1q_f32(sptr_base + (odd_start + iw_idx / 2) * ic_step, zero_v);
GiStoreFloat32(
sptr_base + (odd_start + iw_idx / 2) * ic_step, zero_v);
}
++iw_idx;
}
......@@ -163,7 +165,7 @@ void pack_src_fp32_nchw44<2>(
}
} // namespace conv_bias
} // namespace arm_common
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_BIAS(2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_NO_BIAS(2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_BIAS(2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_NO_BIAS(2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_BIAS(3);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(3);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_NO_BIAS(3);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_BIAS(3);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(3);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_NO_BIAS(3);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_BIAS(5);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(5);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_NO_BIAS(5);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_BIAS(5);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(5);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_NO_BIAS(5);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_BIAS(7);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(7);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h"
INSTANTIATION_CONV_S1_NO_BIAS(7);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_BIAS(7);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(7);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h"
INSTANTIATION_CONV_S2_NO_BIAS(7);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -12,16 +12,15 @@
*/
#include "megdnn/arch.h"
#include "src/arm_common/conv_bias/fp32/f32_direct_nchw44_kern.h"
#include "src/arm_common/conv_bias/intrinsic_helper.h"
#include "src/arm_common/elemwise_helper/elemwise_op.h"
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/common/unroll_macro.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/gi/fp32/f32_direct_nchw44_kern.h"
#include "src/fallback/conv_bias/gi/intrinsic_helper.h"
#include "src/fallback/elemwise_helper/elemwise_op.h"
using namespace megdnn;
using namespace arm_common;
using namespace fallback;
namespace {
template <
......@@ -39,13 +38,13 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> {
};
#define cb2(step, lane, ow_block) \
c[0][step] = vfmaq_laneq_f32( \
c[0][step] = GiSimdFmaLane( \
c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane); \
c[1][step] = vfmaq_laneq_f32( \
c[1][step] = GiSimdFmaLane( \
c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane);
#define cb(step, lane, ow_block) \
c[0][step] = vfmaq_laneq_f32( \
#define cb(step, lane, ow_block) \
c[0][step] = GiSimdFmaLane( \
c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane);
#define SHIFT_CAL_HELPER(ow_block, remain_w) \
......@@ -122,7 +121,7 @@ public:
template <
BiasMode bias_mode, typename Op, int remain_w, int filter_size, int oc_block,
int ow_block>
struct KerNeonXXs1Nchw44FP32 {
struct KerGiXXs1Nchw44FP32 {
static void impl(
const float32_t* src_ptr, const float32_t* weight_ptr,
const float32_t* bias_ptr, float32_t* dst_ptr, int ic, int ih, int iw,
......@@ -130,7 +129,7 @@ struct KerNeonXXs1Nchw44FP32 {
};
template <BiasMode bias_mode, typename Op, int remain_w, int oc_block, int ow_block>
struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
static void impl(
const float32_t* src_ptr_origin, const float32_t* weight_ptr,
const float32_t* bias_ptr, float32_t* dst_ptr, int ic, int ih, int iw,
......@@ -147,20 +146,20 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
const int ld_src_ic = ih * iw;
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
GI_FLOAT32_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
float32x4_t src[ow_block];
float32x4_t weight[c_dim][ic_step];
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
GI_FLOAT32_t src[ow_block];
GI_FLOAT32_t weight[c_dim][ic_step];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[0] = GiLoadFloat32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
......@@ -172,7 +171,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
};
template <BiasMode bias_mode, typename Op, int remain_w, int oc_block, int ow_block>
struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
static void impl(
const float32_t* src_ptr_origin, const float32_t* weight_ptr,
const float32_t* bias_ptr, float32_t* dst_ptr, int ic, int ih, int iw,
......@@ -189,24 +188,24 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
const int ld_src_ic = ih * iw;
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
GI_FLOAT32_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
float32x4_t src[ow_block];
float32x4_t weight[c_dim][ic_step];
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
GI_FLOAT32_t src[ow_block];
GI_FLOAT32_t weight[c_dim][ic_step];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[0] = GiLoadFloat32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[1] = GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step);
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
......@@ -217,7 +216,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
}
};
template <BiasMode bias_mode, typename Op, int remain_w, int oc_block, int ow_block>
struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
static void impl(
const float32_t* src_ptr_origin, const float32_t* weight_ptr,
const float32_t* bias_ptr, float32_t* dst_ptr, int ic, int ih, int iw,
......@@ -234,36 +233,36 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
const int ld_src_ic = ih * iw;
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
GI_FLOAT32_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
float32x4_t src[ow_block];
float32x4_t weight[c_dim][ic_step];
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
GI_FLOAT32_t src[ow_block];
GI_FLOAT32_t weight[c_dim][ic_step];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[0] = GiLoadFloat32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[1] = GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step);
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[2] = vld1q_f32(src_ptr + (ow_block + 2) * ic_step);
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[2] = GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step);
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[3] = vld1q_f32(src_ptr + (ow_block + 3) * ic_step);
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[3] = GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step);
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
......@@ -275,7 +274,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
};
template <BiasMode bias_mode, typename Op, int remain_w, int oc_block, int ow_block>
struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
static void impl(
const float32_t* src_ptr_origin, const float32_t* weight_ptr,
const float32_t* bias_ptr, float32_t* dst_ptr, int ic, int ih, int iw,
......@@ -292,46 +291,46 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
const int ld_src_ic = ih * iw;
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
GI_FLOAT32_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
float32x4_t src[ow_block];
float32x4_t weight[c_dim][ic_step];
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
GI_FLOAT32_t src[ow_block];
GI_FLOAT32_t weight[c_dim][ic_step];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[0] = GiLoadFloat32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[1] = GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step);
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[2] = vld1q_f32(src_ptr + (ow_block + 2) * ic_step);
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[2] = GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step);
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[3] = vld1q_f32(src_ptr + (ow_block + 3) * ic_step);
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[3] = GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step);
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[4] = vld1q_f32(src_ptr + (ow_block + 4) * ic_step);
load_helper<ic_step, 5 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[4] = GiLoadFloat32(src_ptr + (ow_block + 4) * ic_step);
load_helper<ic_step, 5 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<5, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[5] = vld1q_f32(src_ptr + (ow_block + 5) * ic_step);
load_helper<ic_step, 6 * ld_weight, oc_step, c_dim, Vld1q_f32>(
src[5] = GiLoadFloat32(src_ptr + (ow_block + 5) * ic_step);
load_helper<ic_step, 6 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<6, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
......@@ -352,10 +351,10 @@ void conv_bias::conv_direct_fp32_nchw44(
constexpr int fh = filter_size;
constexpr int fw = filter_size;
constexpr int ic_step = 4;
#if MEGDNN_ARMV7
constexpr int big_oc_step = 4;
#else
#if MEGDNN_AARCH64
constexpr int big_oc_step = 8;
#else
constexpr int big_oc_step = 4;
#endif
constexpr int oc_step = 4;
constexpr int ih_step = 1;
......@@ -381,9 +380,9 @@ void conv_bias::conv_direct_fp32_nchw44(
switch (ow_remain) {
#define cb(step) \
case step: \
kern_big_oc_remain = KerNeonXXs1Nchw44FP32< \
kern_big_oc_remain = KerGiXXs1Nchw44FP32< \
bias_mode, Op, step, filter_size, big_oc_step, ow_step>::impl; \
kern_small_oc_remain = KerNeonXXs1Nchw44FP32< \
kern_small_oc_remain = KerGiXXs1Nchw44FP32< \
bias_mode, Op, step, filter_size, oc_step, ow_step>::impl; \
break;
......@@ -402,7 +401,7 @@ void conv_bias::conv_direct_fp32_nchw44(
oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step;
const int bias_offset =
bias_mode == BiasMode::BIAS ? dst_offset : oc_idx;
KerNeonXXs1Nchw44FP32<
KerGiXXs1Nchw44FP32<
bias_mode, Op, ow_step, filter_size, big_oc_step, ow_step>::
impl(src + src_offset, filter + weight_offset,
bias + bias_offset, dst + dst_offset, ic, ih, iw,
......@@ -434,7 +433,7 @@ void conv_bias::conv_direct_fp32_nchw44(
oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step;
const int bias_offset =
bias_mode == BiasMode::BIAS ? dst_offset : oc_idx;
KerNeonXXs1Nchw44FP32<
KerGiXXs1Nchw44FP32<
bias_mode, Op, ow_step, filter_size, oc_step, ow_step>::
impl(src + src_offset, filter + weight_offset,
bias + bias_offset, dst + dst_offset, ic, ih, iw,
......
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BIAS(2, 1);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(2, 1);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_NO_BIAS(2, 1);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BIAS(2, 2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(2, 2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_NO_BIAS(2, 2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BIAS(3, 1);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(3, 1);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_no_bias
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_no_bias
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_NO_BIAS(3, 1);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BIAS(3, 2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(3, 2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_NO_BIAS(3, 2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BIAS(5, 1);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(5, 1);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_NO_BIAS(5, 1);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BIAS(5, 2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(5, 2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_no_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_no_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_NO_BIAS(5, 2);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BIAS(7, 1);
// vim: syntax=cpp.doxygen
/**
* \file
* dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_broadcast_channel_bias.cpp
* dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_broadcast_channel_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,6 +10,6 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
#include "src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h"
INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(7, 1);
// vim: syntax=cpp.doxygen
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册