提交 580a2753 编写于 作者: M Megvii Engine Team

feat(dnn/arm): add nchw44 fp32 direct stride 1

GitOrigin-RevId: 65f54a4f7ea754b16588baa1dc017ebe6599940d
上级 caf1fac2
......@@ -178,6 +178,22 @@ public:
const NCBKernSizeParam& param) const override;
};
/**
 * Conv-bias algorithm registered under the name "F32_CONV_NCHW44_DIRECT".
 *
 * The class only declares the AlgoBase overrides; usable(), get_workspace()
 * and dispatch_kerns() are defined out of line.
 */
class ConvBiasImpl::AlgoF32DirectNCHW44 final : public AlgoBase {
private:
    //! Internal kernel-list builder (declaration only in this header).
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;

public:
    AlgoF32DirectNCHW44() {}

    //! Algorithm name string.
    const char* name() const override { return "F32_CONV_NCHW44_DIRECT"; }

    //! Always reports the algorithm as reproducible.
    bool is_reproducible() const override { return true; }

    //! Whether this algorithm can handle the problem described by \p param.
    bool usable(fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;

    //! Workspace size in bytes needed for \p param.
    size_t get_workspace(fallback::ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;

    //! Kernels to run for \p param.
    SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl* opr,
            const NCBKernSizeParam& param) const override;
};
class ConvBiasImpl::AlgoF32DirectStride2NCHW44 final : public AlgoBase {
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
......
/**
* \file dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_algo.cpp
* \file dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
......@@ -12,10 +12,9 @@
#include "megdnn/oprs.h"
#include "src/arm_common/conv_bias/fp32/algos.h"
#include "src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.h"
#include "src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h"
#include "src/arm_common/conv_bias/fp32/strategy.h"
#include "src/arm_common/elemwise_op.h"
#include "src/common/opr_delegate.h"
#include "midout.h"
......@@ -25,7 +24,7 @@ using conv_fun = std::function<void(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;
MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw44_stride2)
MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw44_stride1)
namespace {
// block_helper is used to calculate oh block size
static inline int block_helper(const int nthread, const int amount,
......@@ -79,7 +78,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
return {nullptr, {src_size * param.nr_threads}};
};
template <size_t filter, BiasMode bias_mode, typename Op>
template <size_t filter, BiasMode bias_mode, typename Op, int stride>
static void do_conv_kern(WorkspaceBundle bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index,
......@@ -125,11 +124,17 @@ static void do_conv_kern(WorkspaceBundle bundle,
const size_t src_size = get_perthread_cache_bytes(ic, ih2, iw2);
float* sptr = reinterpret_cast<float*>((int8_t*)bundle.get(0) +
ncb_index.thread_id * src_size);
conv_bias::pack_src_fp32_nchw44_stride2(
sptr, origin_sptr, ph, pw, remain_right_pad,
ih_real - src_top_pad - src_bottom_pad, iw, iw2, src_top_pad,
src_bottom_pad, ic, ih * iw);
if (stride == 1) {
conv_bias::pack_src_fp32_nchw44_stride1(
sptr, origin_sptr, ph, pw, remain_right_pad,
ih_real - src_top_pad - src_bottom_pad, iw, iw2, src_top_pad,
src_bottom_pad, ic, ih * iw);
} else {
conv_bias::pack_src_fp32_nchw44_stride2(
sptr, origin_sptr, ph, pw, remain_right_pad,
ih_real - src_top_pad - src_bottom_pad, iw, iw2, src_top_pad,
src_bottom_pad, ic, ih * iw);
}
const float* fptr =
kern_param.filter<dt_float32>(group_id) + oc_idx * fh * fw * ic;
......@@ -142,46 +147,59 @@ static void do_conv_kern(WorkspaceBundle bundle,
kern_param.bias<dt_float32>(batch_id, group_id) + bias_offset;
Op op;
if (stride == 1) {
#define KERN1_NCHW44_CONV(filter) \
conv_bias::conv_direct_stride1_##filter##x##filter##_fp32_nchw44< \
\
bias_mode, Op>(sptr, fptr, bptr, nullptr, dst, oc_block, ic, \
ih_real, iw2, oh, oh_block_real, ow, op, ph, pw)
DISPATCH_FILTER(filter, KERN1_NCHW44_CONV);
#undef KERN1_NCHW44_CONV
} else {
#define KERN1_NCHW44_CONV(filter) \
conv_bias::conv_direct_stride2_##filter##x##filter##_fp32_nchw44< \
\
bias_mode, Op>(sptr, fptr, bptr, nullptr, dst, oc_block, ic, \
ih_real, iw2, oh, oh_block_real, ow, op, ph, pw)
DISPATCH_FILTER(filter, KERN1_NCHW44_CONV);
DISPATCH_FILTER(filter, KERN1_NCHW44_CONV);
#undef KERN1_NCHW44_CONV
}
}
} // namespace
/* ===================== stride2 algo ===================== */
bool ConvBiasImpl::AlgoF32DirectStride2NCHW44::usable(
fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
AlgoSelectionStrategy) const {
/* ============ direct nchw44 algo (stride 1 and stride 2) ============ */
// Returns true only when every individual check below passes for `param`.
// NOTE(review): this listing appears to interleave the pre- and post-change
// lines of a diff (two `ok_src_dst` lines, two stride conditions); the
// comments describe the later, broader variant - confirm against the real file.
bool ConvBiasImpl::AlgoF32DirectNCHW44::usable(fallback::ConvBiasImpl*,
const NCBKernSizeParam& param,
AlgoSelectionStrategy) const {
auto&& fm = param.filter_meta;
auto fh = fm.spatial[0];
int oc = fm.ocpg;
int ic = fm.icpg;
// src, filter and dst must all be Float32 and the filter format NCHW44.
bool ok_type = ((param.src_type.enumv() == DTypeEnum::Float32 &&
param.filter_type.enumv() == DTypeEnum::Float32 &&
(param.dst_type.enumv() == DTypeEnum::Float32))) &&
(fm.format == param::Convolution::Format::NCHW44);
// Per-group channel counts must be multiples of 4 and at least 4.
bool ok_src_dst = (oc % 4 == 0 && oc >= 4);
bool ok_src_dst = (oc % 4 == 0 && oc >= 4 && ic % 4 == 0 && ic >= 4);
// Square 2D filters of size 2, 3, 5 or 7 only.
bool ok_filter = fm.spatial_ndim == 2 && fh == fm.spatial[1] &&
(fh == 2 || fh == 3 || fh == 5 || fh == 7);
// No dilation; stride must be the same in both dimensions (1x1 or 2x2).
bool ok_slide = fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
fm.stride[0] == 2 && fm.stride[1] == 2;
((fm.stride[0] == 1 && fm.stride[1] == 1) ||
(fm.stride[0] == 2 && fm.stride[1] == 2));
// Filters flagged as should_flip are rejected.
bool ok_conv = !fm.should_flip;
bool avaible = ok_type && ok_src_dst && ok_filter && ok_slide && ok_conv;
return avaible;
}
size_t ConvBiasImpl::AlgoF32DirectStride2NCHW44::get_workspace(
// Workspace requirement in bytes: the total size of the WorkspaceBundle that
// get_bundle() builds for `param`. The ConvBiasImpl pointer is unused.
size_t ConvBiasImpl::AlgoF32DirectNCHW44::get_workspace(
fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
return get_bundle(param).total_size_in_bytes();
}
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoF32DirectStride2NCHW44::dispatch_kerns(
ConvBiasImpl::AlgoF32DirectNCHW44::dispatch_kerns(
fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
auto fm = param.filter_meta;
const int batch = param.n;
......@@ -190,27 +208,43 @@ ConvBiasImpl::AlgoF32DirectStride2NCHW44::dispatch_kerns(
conv_fun do_conv_fun = nullptr;
// NOTE: remain_w is not used to gen hash of midout for compatible with
// shape runtime
#define DO_CONV_KERN_FUN(filter, bias_mode, op) \
MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp32_nchw44_stride2, \
midout_iv(#filter #bias_mode #op##_hash)) { \
do_conv_fun = do_conv_kern<filter, bias_mode, op>; \
} \
#define DO_CONV_KERN_FUN(filter, bias_mode, op, stride) \
MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp32_nchw44_stride1, \
midout_iv(#filter #bias_mode #stride #op##_hash)) { \
do_conv_fun = do_conv_kern<filter, bias_mode, op, stride>; \
} \
MIDOUT_END();
#define GET_OP_PARAM(filter, bias_mode) \
switch (param.nonlineMode) { \
case param::ConvBias::NonlineMode::IDENTITY: \
DO_CONV_KERN_FUN(filter, bias_mode, NoneOp<dt_float32>) \
break; \
case param::ConvBias::NonlineMode::RELU: \
DO_CONV_KERN_FUN(filter, bias_mode, ReluOp<dt_float32>) \
break; \
case param::ConvBias::NonlineMode::H_SWISH: \
DO_CONV_KERN_FUN(filter, bias_mode, HSwishOp<dt_float32>) \
break; \
default: \
megdnn_assert(0); \
break; \
#define GET_STRIDE_PARAM(filter, bias_mode, op) \
switch (fm.stride[0]) { \
case 1: \
DO_CONV_KERN_FUN(filter, bias_mode, op, 1); \
break; \
case 2: \
DO_CONV_KERN_FUN(filter, bias_mode, op, 2); \
break; \
\
default: \
megdnn_assert(0); \
}
#define GET_OP_PARAM(filter, bias_mode) \
switch (param.nonlineMode) { \
case param::ConvBias::NonlineMode::IDENTITY: \
GET_STRIDE_PARAM(filter, bias_mode, NoneOp<dt_float32>) \
break; \
case param::ConvBias::NonlineMode::RELU: \
GET_STRIDE_PARAM(filter, bias_mode, ReluOp<dt_float32>) \
break; \
case param::ConvBias::NonlineMode::H_SWISH: \
GET_STRIDE_PARAM(filter, bias_mode, HSwishOp<dt_float32>) \
break; \
case param::ConvBias::NonlineMode::SIGMOID: \
GET_STRIDE_PARAM(filter, bias_mode, SigmoidOp<dt_float32>) \
break; \
default: \
megdnn_assert(0); \
break; \
}
#define GET_BIAS_MODE_PARAM(filter) \
......
/**
* \file dnn/src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
// Include guard: the header previously had none. The project's other headers
// use `#pragma once`, so the same form is used here.
#pragma once

#include "src/arm_common/conv_bias/opr_impl.h"
#include "src/fallback/conv_bias/common.h"

namespace megdnn {
namespace arm_common {
namespace conv_bias {

/**
 * KERN(stride, i, layout) declares the function template
 *     conv_direct_<stride>_<i>x<i>_fp32_<layout><bias_mode, Op>(...)
 *
 * Only declarations are emitted here; the definitions and instantiations are
 * expected to live in a separate translation unit (TODO confirm).
 *
 * Parameters of the declared function, as named in the signature:
 *   src, filter, bias  read-only float buffers
 *   temp               float scratch pointer
 *   dst                output float buffer
 *   oc, ic             channel counts
 *   ih, iw             input height / width
 *   oh, oh_block, ow   output height, output-height block, output width
 *   op                 functor of type Op
 *   ph, pw             padding in height / width
 * NOTE(review): exact units and layout of each buffer are not visible in
 * this header - confirm against the kernel implementation.
 */
#define KERN(stride, i, layout)                                                \
    template <BiasMode bias_mode, typename Op>                                 \
    void conv_direct_##stride##_##i##x##i##_fp32_##layout(                     \
            const float* src, const float* filter, const float* bias,         \
            float* temp, float* dst, const int oc, const int ic, const int ih, \
            const int iw, const int oh, const int oh_block, const int ow,      \
            const Op& op, const int ph, const int pw);

// Stride-1 NCHW44 kernels for the 2x2, 3x3, 5x5 and 7x7 filter sizes.
KERN(stride1, 2, nchw44)
KERN(stride1, 3, nchw44)
KERN(stride1, 5, nchw44)
KERN(stride1, 7, nchw44)

#undef KERN

/**
 * Source packing routine paired with the stride-1 NCHW44 kernels.
 *
 * \param sptr_base    destination float buffer
 * \param sptr_origin  source float buffer (read-only)
 * \param ph, pw       padding in height / width
 * \param pad_right    right padding
 * \param ih, iw       input height / width
 * \param iw2          second width value; presumably the padded row width of
 *                     the destination - TODO confirm
 * \param pad_top, pad_bottom  top / bottom padding
 * \param ic           channel count
 * \param ic_stride    stride between channels in the source
 * NOTE(review): only the declaration is visible here; the descriptions above
 * follow the parameter names and should be confirmed against the definition.
 */
void pack_src_fp32_nchw44_stride1(float* sptr_base, const float* sptr_origin,
                                  const int ph, const int pw,
                                  const int pad_right, const int ih,
                                  const int iw, const int iw2,
                                  const int pad_top, const int pad_bottom,
                                  const int ic, const int ic_stride);

}  // namespace conv_bias
}  // namespace arm_common
}  // namespace megdnn
......@@ -721,10 +721,11 @@ CONSTRUCT_FUNC(7);
const int, const int, const int, const int, const Op&, \
const int, const int);
#define FOR_OP(stride, i, bias) \
INSTANTIATION(stride, i, bias, NoneOp<dt_float32>) \
INSTANTIATION(stride, i, bias, ReluOp<dt_float32>) \
INSTANTIATION(stride, i, bias, HSwishOp<dt_float32>)
// FOR_OP(stride, i, bias): expands INSTANTIATION once for each of the four
// float32 ops listed below (NoneOp, ReluOp, HSwishOp, SigmoidOp), forwarding
// the given stride tag, filter size `i` and bias mode unchanged.
#define FOR_OP(stride, i, bias) \
INSTANTIATION(stride, i, bias, NoneOp<dt_float32>) \
INSTANTIATION(stride, i, bias, ReluOp<dt_float32>) \
INSTANTIATION(stride, i, bias, HSwishOp<dt_float32>) \
INSTANTIATION(stride, i, bias, SigmoidOp<dt_float32>)
#define FOR_BIAS(stride, i) \
FOR_OP(stride, i, BiasMode::NO_BIAS) \
......
......@@ -67,7 +67,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
AlgoF32Direct f32_direct_large_group{true};
AlgoF32Direct f32_direct_small_group{false};
AlgoF32DirectStride2NCHW44 f32_direct_stride2_nchw44;
AlgoF32DirectNCHW44 f32_direct_nchw44;
AlgoF32DirectStride2 f32_direct_stride2_large_group{true};
AlgoF32DirectStride2 f32_direct_stride2_small_group{false};
AlgoF32DirectStride1 f32_direct_stride1_large_group{true};
......@@ -126,8 +126,7 @@ public:
direct_algos.emplace_back(&i8x8x16_stride2_large_group);
direct_algos.emplace_back(&i8x8x16_stride2_small_group);
direct_algos.emplace_back(&f32_direct_stride2_nchw_nchw44);
direct_algos.emplace_back(&f32_direct_stride2_nchw44);
direct_algos.emplace_back(&f32_direct_nchw44);
direct_algos.emplace_back(&f32_direct_stride1_large_group);
direct_algos.emplace_back(&f32_direct_stride1_small_group);
direct_algos.emplace_back(&f32_direct_stride2_large_group);
......
......@@ -66,10 +66,11 @@ private:
#endif
class AlgoF32Direct;
class AlgoF32DirectStride1;
class AlgoF32DirectNCHW44;
class AlgoF32DirectStride2;
class AlgoF32DirectStride2NCHWNCHW44;
class AlgoF32DirectStride2NCHW44;
class AlgoI8x8x16Direct;
class AlgoI8x8x16Stride2;
class AlgoI8x8x16Stride2Filter2;
......
......@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
......@@ -43,6 +44,10 @@ struct SigmoidOp;
vst1q_##_func_suffix(dst, vitem.val[0]); \
vst1q_##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
void operator()(const _neon_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
vst1q_##_func_suffix(dst, vitem); \
} \
_neon_type2 operator()(const _neon_type2& src) const { \
return {{operator()(src.val[0]), operator()(src.val[1])}}; \
} \
......
......@@ -203,11 +203,9 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) {
run(1, 128, 256, 28, 28, 3, 2, false);
run(1, 256, 512, 14, 14, 3, 2, false);
run(1, 64, 128, 56, 56, 7, 2, false);
run(1, 128, 256, 28, 28, 7, 2, false);
run(1, 256, 512, 14, 14, 7, 2, false);
run(1, 64, 64, 48, 48, 3, 2, false);
run(1, 128, 128, 28, 28, 3, 1, false);
run(1, 256, 256, 14, 14, 3, 1, false);
run(1, 512, 512, 7, 7, 3, 1, false);
} else {
for (size_t stride : {1, 2}) {
printf("stride %zu\n", stride);
......
......@@ -72,7 +72,8 @@ std::vector<conv_bias::TestArg> get_int8_quint8_conv_bias_args(
std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
std::vector<size_t> kernel_vec, size_t stride, bool no_pad = false,
bool no_bias = false, bool no_nonlinemode = false,
bool is_input_nchw = false, bool support_full_bias = false) {
bool is_input_nchw = false, bool support_full_bias = false,
bool support_sigmoid = false) {
using namespace conv_bias;
using NLMode = param::ConvBias::NonlineMode;
std::vector<TestArg> args;
......@@ -151,6 +152,9 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
nonlinemode.emplace_back(NLMode::RELU);
nonlinemode.emplace_back(NLMode::H_SWISH);
}
if (support_sigmoid) {
nonlinemode.emplace_back(NLMode::SIGMOID);
}
std::vector<megdnn::BiasMode> bias_mode = {
megdnn::BiasMode::BROADCAST_CHANNEL_BIAS};
......@@ -337,11 +341,16 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_SMALL_GROUP) {
get_conv_bias_args({1, 2, 3, 4, 5, 6, 7}, 1, false, false, false),
handle(), "F32DIRECT_SMALL_GROUP");
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_NCHW44_S1) {
    // Checks the "F32_CONV_NCHW44_DIRECT" algorithm on stride-1 NCHW44 cases
    // with kernel sizes 2, 3, 5 and 7. The constants below are named after
    // the parameters of get_nchw44_conv_bias_args().
    constexpr size_t kStride = 1;
    constexpr bool kNoPad = false;
    constexpr bool kNoBias = false;
    constexpr bool kNoNonlineMode = false;
    constexpr bool kIsInputNchw = false;
    constexpr bool kSupportFullBias = true;
    constexpr bool kSupportSigmoid = true;

    check_conv_bias(
            get_nchw44_conv_bias_args({2, 3, 5, 7}, kStride, kNoPad, kNoBias,
                                      kNoNonlineMode, kIsInputNchw,
                                      kSupportFullBias, kSupportSigmoid),
            handle(), "F32_CONV_NCHW44_DIRECT");
}
// Stride-2 NCHW44 direct fp32 check: kernel sizes 2, 3, 5 and 7 with stride 2.
// NOTE(review): the listing shows two argument tails, ending in the algorithm
// names "F32_CONV_NCHW44_DIRECT_S2" and "F32_CONV_NCHW44_DIRECT". These are
// presumably the before/after lines of the change - confirm which one the
// real file contains.
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_NCHW44_S2) {
check_conv_bias(get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false,
false, false, true),
handle(), "F32_CONV_NCHW44_DIRECT_S2");
false, false, true, true),
handle(), "F32_CONV_NCHW44_DIRECT");
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_STR1_LARGE_GROUP) {
......@@ -682,8 +691,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) {
size_t conv_bias_workspace_in_bytes =
conv_bias_opr->get_workspace_in_bytes(
tensors[0].layout, filter_transform_layout,
tensors[2].layout, tensors[3].layout,
tensors[4].layout, nullptr);
tensors[2].layout, tensors[3].layout, tensors[4].layout,
nullptr);
WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
conv_bias_workspace_in_bytes,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册