提交 66950a4f 编写于 作者: M Megvii Engine Team

feat(dnn/arm): add nchw44 fp32 direct conv stride2

GitOrigin-RevId: 4106b46b6cea3b496a8dfcfae80f6383a96d7739
上级 bfe945fb
......@@ -178,6 +178,23 @@ public:
const NCBKernSizeParam& param) const override;
};
// Direct fp32 convolution for NCHW44 layout, stride 2, filter sizes
// 2/3/5/7. Packs source per-thread, then runs hand-written NEON kernels.
class ConvBiasImpl::AlgoF32DirectStride2NCHW44 final : public AlgoBase {
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;

public:
    AlgoF32DirectStride2NCHW44() {}
    bool is_reproducible() const override { return true; }
    const char* name() const override { return "F32_CONV_NCHW44_DIRECT_S2"; }
    bool usable(fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(fallback::ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;
    // `virtual` dropped: `override` already implies it and the class is final.
    SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl* opr,
            const NCBKernSizeParam& param) const override;
};
class ConvBiasImpl::AlgoF32DirectStride1 final : public AlgoBase {
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
bool m_large_group;
......
/**
* \file dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_algo.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied.
*/
#include "megdnn/oprs.h"
#include "src/arm_common/conv_bias/fp32/algos.h"
#include "src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h"
#include "src/arm_common/conv_bias/fp32/strategy.h"
#include "src/arm_common/elemwise_op.h"
#include "src/common/opr_delegate.h"
#include "midout.h"
using namespace megdnn;
using namespace arm_common;
using conv_fun = std::function<void(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;
MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw44_stride2)
namespace {
// Choose how many output rows (oh) one block should cover: size each
// block's working set to roughly one L2 cache, then pick between the two
// candidate partitions the one that wastes the least tail work.
static inline int block_helper(const int nthread, const int amount,
                               const int size_per_unit) {
    constexpr int l2_cache_size = 256 * 1024;
    const int per_thread = div_ceil(amount, nthread);
    // Largest block whose footprint still fits in L2 (rounded to nearest).
    const int l2_fit_block = std::min(
            amount, (l2_cache_size + size_per_unit / 2) / size_per_unit);
    const int num_hi = div_ceil(per_thread, l2_fit_block);
    const int num_lo = std::max(num_hi - 1, 1);
    const int blk_hi = div_ceil(per_thread, num_hi);
    const int blk_lo = div_ceil(per_thread, num_lo);
    const int loss_hi = std::abs(num_hi * blk_hi - per_thread);
    const int loss_lo = std::abs(num_lo * blk_lo - per_thread);
    if (loss_hi > loss_lo) {
        return blk_lo;
    }
    return blk_hi;
}
// Bytes of packing scratch one thread needs: the packed source tensor
// (ic x ih2 x iw2 floats) plus a 128-byte guard region so the vector
// kernels may over-read past the tail without touching illegal memory.
// Widen to size_t BEFORE multiplying: the original int product could
// overflow for large tensors before being converted to size_t.
static inline size_t get_perthread_cache_bytes(const int ic, const int ih2,
                                               const int iw2) {
    // border_size is used to avoid read illegal memory
    constexpr size_t border_size = 64 * 2;
    return static_cast<size_t>(ic) * ih2 * iw2 * sizeof(float) + border_size;
}
// Compute the blocked/padded shapes this algorithm works on:
//   oh2/ow2 — output kept at its natural size (no rounding);
//   ih2     — input rows needed to cover one oh block of output;
//   iw2     — input width plus padding, rounded up to a cache line.
static void get_rectified_size(
        const megdnn::fallback::ConvBiasImpl::NCBKernSizeParam& param, int& ih2,
        int& iw2, int& oh2, int& ow2) {
    auto&& fm = param.filter_meta;
    const int ic = fm.icpg;
    const int iw = param.isz[1];
    oh2 = param.osz[0];
    ow2 = param.osz[1];
    // Rows of output handled per block — must agree with do_conv_kern.
    const int block_oh =
            block_helper(param.nr_threads, oh2, ic * iw * sizeof(float) * 2);
    const int stride_h = static_cast<int>(fm.stride[0]);
    const int filter_h = static_cast<int>(fm.spatial[0]);
    ih2 = (block_oh - 1) * stride_h + filter_h;
    constexpr int cacheline = 64 / sizeof(float);
    iw2 = round_up(iw + 2 * static_cast<int>(fm.padding[1]), cacheline);
}
// Build the workspace: one flat allocation holding a packed-source scratch
// slice per thread (kernels index into it by thread id).
// Also drops the stray `;` that followed the function's closing brace.
static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
    auto&& fm = param.filter_meta;
    int ic = fm.icpg;
    int ih2, iw2, oh2, ow2;
    get_rectified_size(param, ih2, iw2, oh2, ow2);
    size_t src_size = get_perthread_cache_bytes(ic, ih2, iw2);
    return {nullptr, {src_size * param.nr_threads}};
}
// One kernel invocation: pack the source rows this oh block needs into the
// per-thread scratch buffer, then run the hand-written stride-2 NCHW44
// direct convolution for the chosen filter size / bias mode / activation.
// ncb_index.ndrange_id = {batch, group, oh-block index}.
template <size_t filter, BiasMode bias_mode, typename Op>
static void do_conv_kern(WorkspaceBundle bundle,
                         const ConvBiasImpl::NCBKernParam& kern_param,
                         const ConvBiasImpl::NCBKernIndex& ncb_index,
                         const CpuNDRange&, const CpuNDRange&) {
    const int oh = kern_param.osz[0];
    const int ow = kern_param.osz[1];
    const int fh = kern_param.filter_meta.spatial[0];
    const int fw = kern_param.filter_meta.spatial[1];
    const int ic = kern_param.filter_meta.icpg;
    const int oc = kern_param.filter_meta.ocpg;
    const int ih = kern_param.isz[0];
    const int iw = kern_param.isz[1];
    const int stride_h = kern_param.filter_meta.stride[0];
    const int ph = kern_param.filter_meta.padding[0];
    const int pw = kern_param.filter_meta.padding[1];
    int ih2 = 0;
    int iw2 = 0;
    int oh2 = 0;
    int ow2 = 0;
    get_rectified_size(kern_param, ih2, iw2, oh2, ow2);
    bundle.set(kern_param.workspace_ptr);
    constexpr int pack_c = 4;
    const int batch_id = ncb_index.ndrange_id[0];
    const int group_id = ncb_index.ndrange_id[1];
    // The whole oc range is handled in one call; no oc blocking here.
    constexpr int oc_idx = 0;
    int oc_block = oc;
    // Must reproduce the same oh blocking as dispatch_kerns /
    // get_rectified_size so every block agrees on its row range.
    int oh_block = block_helper(kern_param.nr_threads, oh2,
                                ic * iw * sizeof(float) * 2);
    const int oh_idx = ncb_index.ndrange_id[2];
    // Last block may be shorter than oh_block.
    const int oh_block_real = std::min(oh - oh_idx * oh_block, oh_block);
    const int ih_real = oh_block_real * stride_h + fh - stride_h;
    // Zero-padding rows this particular block needs at top/bottom.
    const int src_top_pad = std::max(ph - oh_idx * oh_block * stride_h, 0);
    const int src_bottom_pad = std::max(
            (oh_idx * oh_block + oh_block_real - 1) * stride_h + fh - ih - ph,
            0);
    const int remain_right_pad = std::max(iw2 - iw - pw, 0);
    // First source element this block reads (clamped to the tensor start).
    const int src_offset =
            std::max(oh_idx * oh_block * stride_h - ph, 0) * iw * pack_c;
    const float* origin_sptr = static_cast<const float*>(kern_param.src<float>(
                                       batch_id, group_id, 0, 1, 1)) +
                               src_offset;
    const size_t src_size = get_perthread_cache_bytes(ic, ih2, iw2);
    // Each thread owns a src_size-byte slice of the packing workspace.
    float* sptr = reinterpret_cast<float*>((int8_t*)bundle.get(0) +
                                           ncb_index.thread_id * src_size);
    conv_bias::pack_src_fp32_nchw44_stride2(
            sptr, origin_sptr, ph, pw, remain_right_pad,
            ih_real - src_top_pad - src_bottom_pad, iw, iw2, src_top_pad,
            src_bottom_pad, ic, ih * iw);
    const float* fptr =
            kern_param.filter<dt_float32>(group_id) + oc_idx * fh * fw * ic;
    float_t* dst = kern_param.dst<float_t>(batch_id, group_id) +
                   oh_idx * oh_block * ow * pack_c;
    // Per-element BIAS advances with the output block; channel-broadcast
    // bias (and no-bias) index only by output channel.
    const int bias_offset = bias_mode == BiasMode::BIAS
                                    ? oh_idx * oh_block * ow * pack_c
                                    : oc_idx;
    const float* bptr =
            kern_param.bias<dt_float32>(batch_id, group_id) + bias_offset;
    Op op;
#define KERN1_NCHW44_CONV(filter)                                          \
    conv_bias::conv_direct_stride2_##filter##x##filter##_fp32_nchw44<      \
                                                                           \
            bias_mode, Op>(sptr, fptr, bptr, nullptr, dst, oc_block, ic,   \
                           ih_real, iw2, oh, oh_block_real, ow, op, ph, pw)
    DISPATCH_FILTER(filter, KERN1_NCHW44_CONV);
#undef KERN1_NCHW44_CONV
}
} // namespace
/* ===================== stride2 algo ===================== */
// Report whether this algorithm can run the given problem: fp32 NCHW44,
// square filter in {2,3,5,7}, stride 2, no dilation, no flipped filter.
bool ConvBiasImpl::AlgoF32DirectStride2NCHW44::usable(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
        AlgoSelectionStrategy) const {
    auto&& fm = param.filter_meta;
    auto fh = fm.spatial[0];
    int oc = fm.ocpg;
    // fp32 src/filter/dst in NCHW44 layout.
    bool ok_type = ((param.src_type.enumv() == DTypeEnum::Float32 &&
                     param.filter_type.enumv() == DTypeEnum::Float32 &&
                     (param.dst_type.enumv() == DTypeEnum::Float32))) &&
                   (fm.format == param::Convolution::Format::NCHW44);
    // Output channels must be a positive multiple of the pack size (4).
    bool ok_src_dst = (oc % 4 == 0 && oc >= 4);
    // Square kernel, one of the sizes with a hand-written implementation.
    bool ok_filter = fm.spatial_ndim == 2 && fh == fm.spatial[1] &&
                     (fh == 2 || fh == 3 || fh == 5 || fh == 7);
    // No dilation; this algo is specialized for stride 2 in both dims.
    bool ok_slide = fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
                    fm.stride[0] == 2 && fm.stride[1] == 2;
    bool ok_conv = !fm.should_flip;
    // (Return the predicate directly; drops the misspelled `avaible` temp.)
    return ok_type && ok_src_dst && ok_filter && ok_slide && ok_conv;
}
// Workspace = the per-thread packed-source buffers sized by get_bundle().
size_t ConvBiasImpl::AlgoF32DirectStride2NCHW44::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    return get_bundle(param).total_size_in_bytes();
}
// Select the concrete do_conv_kern instantiation for this problem's
// (filter size, bias mode, nonlinearity) via the nested switch macros
// below, then return a single kernel whose ndrange is
// {batch, group, number-of-oh-blocks}.
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoF32DirectStride2NCHW44::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    auto fm = param.filter_meta;
    const int batch = param.n;
    const int group = fm.group;
    WorkspaceBundle wbundle = get_bundle(param);
    conv_fun do_conv_fun = nullptr;
    // NOTE: remain_w is not used to gen hash of midout for compatible with
    // shape runtime
    // Innermost: record a midout trace point and bind the kernel pointer.
#define DO_CONV_KERN_FUN(filter, bias_mode, op)                         \
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp32_nchw44_stride2,       \
                 midout_iv(#filter #bias_mode #op##_hash)) {            \
        do_conv_fun = do_conv_kern<filter, bias_mode, op>;              \
    }                                                                   \
    MIDOUT_END();
    // Dispatch level 3: nonlinearity -> Op type.
#define GET_OP_PARAM(filter, bias_mode)                                 \
    switch (param.nonlineMode) {                                        \
        case param::ConvBias::NonlineMode::IDENTITY:                    \
            DO_CONV_KERN_FUN(filter, bias_mode, NoneOp<dt_float32>)     \
            break;                                                      \
        case param::ConvBias::NonlineMode::RELU:                        \
            DO_CONV_KERN_FUN(filter, bias_mode, ReluOp<dt_float32>)     \
            break;                                                      \
        case param::ConvBias::NonlineMode::H_SWISH:                     \
            DO_CONV_KERN_FUN(filter, bias_mode, HSwishOp<dt_float32>)   \
            break;                                                      \
        default:                                                        \
            megdnn_assert(0);                                           \
            break;                                                      \
    }
    // Dispatch level 2: bias mode.
#define GET_BIAS_MODE_PARAM(filter)                                     \
    switch (param.bias_mode) {                                          \
        case BiasMode::NO_BIAS:                                         \
            GET_OP_PARAM(filter, BiasMode::NO_BIAS)                     \
            break;                                                      \
        case BiasMode::BROADCAST_CHANNEL_BIAS:                          \
            GET_OP_PARAM(filter, BiasMode::BROADCAST_CHANNEL_BIAS)      \
            break;                                                      \
        case BiasMode::BIAS:                                            \
            GET_OP_PARAM(filter, BiasMode::BIAS)                        \
            break;                                                      \
        default:                                                        \
            megdnn_assert(0);                                           \
            break;                                                      \
    }
    // Dispatch level 1: filter size (must match `usable`'s accepted set).
#define DISPATCH_CONV_KERN()                                            \
    switch (param.filter_meta.spatial[0]) {                             \
        case 2:                                                         \
            GET_BIAS_MODE_PARAM(2)                                      \
            break;                                                      \
        case 3:                                                         \
            GET_BIAS_MODE_PARAM(3)                                      \
            break;                                                      \
        case 5:                                                         \
            GET_BIAS_MODE_PARAM(5)                                      \
            break;                                                      \
        case 7:                                                         \
            GET_BIAS_MODE_PARAM(7)                                      \
            break;                                                      \
        default:                                                        \
            megdnn_assert(0);                                           \
            break;                                                      \
    }
    DISPATCH_CONV_KERN();
#undef DO_CONV_KERN_FUN
    // NOTE(review): GET_REMAIN_W_PARAM is never defined in this function —
    // this #undef appears vestigial; confirm before removing.
#undef GET_REMAIN_W_PARAM
#undef GET_OP_PARAM
#undef GET_BIAS_MODE_PARAM
#undef DISPATCH_CONV_KERN
    megdnn_assert(do_conv_fun);
    SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
    WorkspaceBundle bundle = wbundle;
    int oh = param.osz[0];
    int ic = param.filter_meta.icpg;
    int iw = param.isz[1];
    // Same oh blocking as do_conv_kern; the third ndrange dim walks blocks.
    int oh_block =
            block_helper(param.nr_threads, oh, ic * iw * sizeof(float) * 2);
    CpuNDRange ncb_range = {static_cast<size_t>(batch),
                            static_cast<size_t>(group),
                            static_cast<size_t>(div_ceil(oh, oh_block))};
    // Capture by value: the lambda outlives this call.
    auto do_conv = [bundle, do_conv_fun, ncb_range](
                           const NCBKernParam& kern_param,
                           const NCBKernIndex& ncb_index) {
        do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id,
                    ncb_range);
    };
    ret_kerns.push_back({do_conv, ncb_range});
    return ret_kerns;
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once  // added: header previously had no include guard

#include "src/arm_common/conv_bias/opr_impl.h"
#include "src/fallback/conv_bias/common.h"

namespace megdnn {
namespace arm_common {
namespace conv_bias {

// Declare the stride-2 NCHW44 direct-conv kernels for each supported
// square filter size. `temp` is an optional scratch pointer (the algo in
// this commit passes nullptr); ih/iw are the packed-source dimensions.
#define KERN(stride, i, layout)                                            \
    template <BiasMode bias_mode, typename Op>                             \
    void conv_direct_##stride##_##i##x##i##_fp32_##layout(                 \
            const float* src, const float* filter, const float* bias,      \
            float* temp, float* dst, const int oc, const int ic,           \
            const int ih, const int iw, const int oh, const int oh_block,  \
            const int ow, const Op& op, const int ph, const int pw);
KERN(stride2, 2, nchw44)
KERN(stride2, 3, nchw44)
KERN(stride2, 5, nchw44)
KERN(stride2, 7, nchw44)
#undef KERN

// Copy one batch/group of NCHW44 source into the per-thread packed buffer
// (ic x ih x iw2), inserting the requested zero padding on every side.
void pack_src_fp32_nchw44_stride2(float* sptr_base, const float* sptr_origin,
                                  const int ph, const int pw,
                                  const int pad_right, const int ih,
                                  const int iw, const int iw2,
                                  const int pad_top, const int pad_bottom,
                                  const int ic, const int ic_stride);

}  // namespace conv_bias
}  // namespace arm_common
}  // namespace megdnn
......@@ -111,7 +111,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block> {
const int ld_src_ic = ih * iw;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
float32x4_t src[src_reg_size];
......@@ -157,7 +157,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block> {
const int ld_src_ic = ih * iw;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
float32x4_t src[src_reg_size];
......@@ -201,7 +201,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block> {
const int ld_src_ic = ih * iw;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
float32x4_t src[src_reg_size];
......@@ -258,7 +258,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block> {
const int ld_src_ic = ih * iw;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
float32x4_t src[src_reg_size];
......
......@@ -194,7 +194,20 @@ struct StoreOcxOw8Remain<2, 0, Op, T> {
op({{c[1][6], c[1][7]}}, dst_ptr + ld_dst_oc + 24);
}
};
// remain == 8 (full width), two output channels: run `op` (activation +
// store) over the 8 accumulators of each channel, two float32x4 vectors
// per call; channel 1 is written ld_dst_oc floats after channel 0.
template <typename Op, typename T>
struct StoreOcxOw8Remain<2, 8, Op, T> {
    static void impl(T& c, const Op& op, float32_t* dst_ptr, int ld_dst_oc) {
        op({{c[0][0], c[0][1]}}, dst_ptr);
        op({{c[0][2], c[0][3]}}, dst_ptr + 8);
        op({{c[0][4], c[0][5]}}, dst_ptr + 16);
        op({{c[0][6], c[0][7]}}, dst_ptr + 24);
        op({{c[1][0], c[1][1]}}, dst_ptr + ld_dst_oc);
        op({{c[1][2], c[1][3]}}, dst_ptr + ld_dst_oc + 8);
        op({{c[1][4], c[1][5]}}, dst_ptr + ld_dst_oc + 16);
        op({{c[1][6], c[1][7]}}, dst_ptr + ld_dst_oc + 24);
    }
};
template <typename Op, typename T>
struct StoreOcxOw8Remain<2, 7, Op, T> {
static void impl(T& c, const Op& op, float32_t* dst_ptr, int ld_dst_oc) {
......@@ -277,6 +290,15 @@ struct StoreOcxOw8Remain<1, 0, Op, T> {
op({{c[0][6], c[0][7]}}, dst_ptr + 24);
}
};
// remain == 8 (full width), single output channel variant: the
// ld_dst_oc stride is unused, hence the unnamed int parameter.
template <typename Op, typename T>
struct StoreOcxOw8Remain<1, 8, Op, T> {
    static void impl(T& c, const Op& op, float32_t* dst_ptr, int) {
        op({{c[0][0], c[0][1]}}, dst_ptr);
        op({{c[0][2], c[0][3]}}, dst_ptr + 8);
        op({{c[0][4], c[0][5]}}, dst_ptr + 16);
        op({{c[0][6], c[0][7]}}, dst_ptr + 24);
    }
};
template <typename Op, typename T>
struct StoreOcxOw8Remain<1, 7, Op, T> {
......@@ -499,46 +521,127 @@ inline void init_oc8_ow8(int32x4_t c[2][8], const int32_t* bias_ptr,
}
}
/////////////////////////init_ocx_ow8////////////////////
template <int c_dim, BiasMode bias_mode, typename T, typename T2>
template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2>
struct InitOcxOw8 {
static void impl(T& c, T2 bias_ptr, int oc_step);
};
template <BiasMode bias_mode, typename T, typename T2>
struct InitOcxOw8<2, bias_mode, T, T2> {
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::NO_BIAS, 8, T, T2> {
static void impl(T& c, const float32_t*, int) {
#define BAIS_INIT(step) \
c[0][step] = vdupq_n_f32(0); \
c[1][step] = vdupq_n_f32(0);
UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
}
};
// No bias, two output channels, 4-wide ow block: zero the accumulators.
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::NO_BIAS, 4, T, T2> {
    static void impl(T& c, const float32_t*, int) {
#define BAIS_INIT(step)          \
    c[0][step] = vdupq_n_f32(0); \
    c[1][step] = vdupq_n_f32(0);
        UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
    }
};
// Channel-broadcast bias, two output channels, 8-wide ow block: load the
// same 4-float bias vector into every ow slot; channel 1 reads at
// bias_ptr + oc_step.
// Fix: the span contained leftover `if (bias_mode == ...) { ... } else {`
// residue from the old generic version, leaving unbalanced braces; the
// specialization needs no runtime branch.
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
    static void impl(T& c, const float32_t* bias_ptr, int oc_step) {
#define BAIS_INIT(step)              \
    c[0][step] = vld1q_f32(bias_ptr); \
    c[1][step] = vld1q_f32(bias_ptr + oc_step);
        UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
    }
};
// Channel-broadcast bias, two output channels, 4-wide ow block.
// Fix: the BAIS_INIT macro body mixed the old zero-init lines
// (`vdupq_n_f32(0)`) with the new bias loads; only the vld1q_f32 loads
// belong in this BROADCAST_CHANNEL_BIAS specialization.
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
    static void impl(T& c, const float32_t* bias_ptr, int oc_step) {
#define BAIS_INIT(step)              \
    c[0][step] = vld1q_f32(bias_ptr); \
    c[1][step] = vld1q_f32(bias_ptr + oc_step);
        UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
    }
};
// Full (per-element) bias, two output channels, 8-wide ow block: each ow
// slot loads its own 4 floats; channel 1 at bias_ptr + oc_step.
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::BIAS, 8, T, T2> {
    static void impl(T& c, const float32_t* bias_ptr, int oc_step) {
        constexpr int simd_len = 4;
#define BAIS_INIT(step)                                  \
    c[0][step] = vld1q_f32(bias_ptr + step * simd_len);  \
    c[1][step] = vld1q_f32(bias_ptr + oc_step + step * simd_len);
        UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
    }
};
// Full (per-element) bias, two output channels, 4-wide ow block.
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::BIAS, 4, T, T2> {
    static void impl(T& c, const float32_t* bias_ptr, int oc_step) {
        constexpr int simd_len = 4;
#define BAIS_INIT(step)                                  \
    c[0][step] = vld1q_f32(bias_ptr + step * simd_len);  \
    c[1][step] = vld1q_f32(bias_ptr + oc_step + step * simd_len);
        UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
    }
};
// No bias, single output channel, 8-wide ow block: zero the accumulators.
template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::NO_BIAS, 8, T, T2> {
    static void impl(T& c, const float32_t*, int) {
#define BAIS_INIT(step) c[0][step] = vdupq_n_f32(0);
        UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
    }
};
// No bias, single output channel, 4-wide ow block.
template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::NO_BIAS, 4, T, T2> {
    static void impl(T& c, const float32_t*, int) {
#define BAIS_INIT(step) c[0][step] = vdupq_n_f32(0);
        UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
    }
};
template <BiasMode bias_mode, typename T, typename T2>
struct InitOcxOw8<1, bias_mode, T, T2> {
template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
static void impl(T& c, const float32_t* bias_ptr, int) {
if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr);
UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
} else {
#define BAIS_INIT(step) c[0][step] = vdupq_n_f32(0);
}
};
// Channel-broadcast bias, single output channel, 4-wide ow block.
template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
    static void impl(T& c, const float32_t* bias_ptr, int) {
#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr);
        UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
    }
};
// Full (per-element) bias, single output channel, 8-wide ow block: each
// ow slot loads its own 4 floats.
template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::BIAS, 8, T, T2> {
    static void impl(T& c, const float32_t* bias_ptr, int) {
        constexpr int simd_len = 4;
#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr + step * simd_len);
        UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
    }
};
// Full (per-element) bias, single output channel, 4-wide ow block.
template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::BIAS, 4, T, T2> {
    static void impl(T& c, const float32_t* bias_ptr, int) {
        constexpr int simd_len = 4;
#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr + step * simd_len);
        UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
    }
};
template <int c_dim, BiasMode bias_mode, typename T, typename T2>
template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2>
inline void init_ocx_ow8(T& c, T2 bias_ptr, int oc_step) {
InitOcxOw8<c_dim, bias_mode, T, T2>::impl(c, bias_ptr, oc_step);
InitOcxOw8<c_dim, bias_mode, ow_block, T, T2>::impl(c, bias_ptr, oc_step);
}
/////////////////////init_ocx_ow4/////////////////////
template <int c_dim, BiasMode bias_mode, typename T>
......@@ -638,6 +741,20 @@ struct LoadHelper<6, base_offset, ptr_step, 0, Func, T, T2, XT...> {
UNROLL_CALL_RAW(6, WEIGHT_CB);
}
};
// 7-element unroll of the WEIGHT_CB load macro (macro defined above this
// specialization group); the c_dim == 0 variant forwards extra args
// through XT... .
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
          typename... XT>
struct LoadHelper<7, base_offset, ptr_step, 0, Func, T, T2, XT...> {
    static void impl(T& src, T2 ptr, int, XT... args) {
        UNROLL_CALL_RAW(7, WEIGHT_CB);
    }
};
// 8-element unroll of the WEIGHT_CB load macro, c_dim == 0 variant.
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
          typename... XT>
struct LoadHelper<8, base_offset, ptr_step, 0, Func, T, T2, XT...> {
    static void impl(T& src, T2 ptr, int, XT... args) {
        UNROLL_CALL_RAW(8, WEIGHT_CB);
    }
};
#undef WEIGHT_CB
#define WEIGHT_CB(step) \
......@@ -674,6 +791,11 @@ struct LoadHelper<7, base_offset, ptr_step, 1, Func, T, T2> {
static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(7, WEIGHT_CB); }
};
// 8-element unroll for the c_dim == 1 WEIGHT_CB variant (macro defined
// just above this group).
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<8, base_offset, ptr_step, 1, Func, T, T2> {
    static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(8, WEIGHT_CB); }
};
#undef WEIGHT_CB
#define WEIGHT_CB(step) \
......@@ -724,6 +846,13 @@ struct LoadHelper<7, base_offset, ptr_step, 2, Func, T, T2> {
}
};
// 8-element unroll for the c_dim == 2 WEIGHT_CB variant; this macro form
// additionally consumes oc_offset.
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<8, base_offset, ptr_step, 2, Func, T, T2> {
    static void impl(T& src, T2 ptr, int oc_offset) {
        UNROLL_CALL_RAW(8, WEIGHT_CB);
    }
};
#undef WEIGHT_CB
template <int weight_number, int base_offset, int ptr_step, int c_dim,
......
......@@ -67,6 +67,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
AlgoF32Direct f32_direct_large_group{true};
AlgoF32Direct f32_direct_small_group{false};
AlgoF32DirectStride2NCHW44 f32_direct_stride2_nchw44;
AlgoF32DirectStride2 f32_direct_stride2_large_group{true};
AlgoF32DirectStride2 f32_direct_stride2_small_group{false};
AlgoF32DirectStride1 f32_direct_stride1_large_group{true};
......@@ -125,6 +126,8 @@ public:
direct_algos.emplace_back(&i8x8x16_stride2_large_group);
direct_algos.emplace_back(&i8x8x16_stride2_small_group);
direct_algos.emplace_back(&f32_direct_stride2_nchw_nchw44);
direct_algos.emplace_back(&f32_direct_stride2_nchw44);
direct_algos.emplace_back(&f32_direct_stride1_large_group);
direct_algos.emplace_back(&f32_direct_stride1_small_group);
direct_algos.emplace_back(&f32_direct_stride2_large_group);
......
......@@ -68,6 +68,8 @@ private:
class AlgoF32DirectStride1;
class AlgoF32DirectStride2;
class AlgoF32DirectStride2NCHWNCHW44;
class AlgoF32DirectStride2NCHW44;
class AlgoI8x8x16Direct;
class AlgoI8x8x16Stride2;
class AlgoI8x8x16Stride2Filter2;
......
......@@ -198,6 +198,16 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) {
run(1, 1, 4, 112, 112, 2, 2, true);
run(1, 3, 32, 224, 224, 3, 2, true);
run(1, 3, 64, 224, 224, 7, 2, true);
run(1, 64, 128, 56, 56, 3, 2, false);
run(1, 128, 256, 28, 28, 3, 2, false);
run(1, 256, 512, 14, 14, 3, 2, false);
run(1, 64, 128, 56, 56, 7, 2, false);
run(1, 128, 256, 28, 28, 7, 2, false);
run(1, 256, 512, 14, 14, 7, 2, false);
run(1, 64, 64, 48, 48, 3, 2, false);
} else {
for (size_t stride : {1, 2}) {
printf("stride %zu\n", stride);
......
......@@ -72,18 +72,16 @@ std::vector<conv_bias::TestArg> get_int8_quint8_conv_bias_args(
std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
std::vector<size_t> kernel_vec, size_t stride, bool no_pad = false,
bool no_bias = false, bool no_nonlinemode = false,
bool is_input_nchw = false) {
bool is_input_nchw = false, bool support_full_bias = false) {
using namespace conv_bias;
using NLMode = param::ConvBias::NonlineMode;
std::vector<TestArg> args;
auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w,
size_t kernel, size_t stride, size_t group, NLMode nlmode,
int any_pad = -1) {
megdnn::BiasMode bias_mode, int any_pad = -1) {
constexpr int pack_c = 4;
const size_t pad = any_pad >= 0 ? any_pad : kernel / 2;
auto bias_mode = no_bias ? megdnn::BiasMode::NO_BIAS
: megdnn::BiasMode::BROADCAST_CHANNEL_BIAS;
auto oc_per_group = oc / group;
auto ic_per_group = ic / group;
bool ok_group = (oc % group == 0 && ic % group == 0) &&
......@@ -116,6 +114,10 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
auto bias_tensor_shape = TensorShape{};
if (bias_mode == megdnn::BiasMode::BROADCAST_CHANNEL_BIAS) {
bias_tensor_shape = {1, oc / pack_c, 1, 1, pack_c};
} else if (bias_mode == megdnn::BiasMode::BIAS) {
bias_tensor_shape = {n, oc / pack_c,
(h + 2 * pad - kernel) / stride + 1,
(w + 2 * pad - kernel) / stride + 1, pack_c};
}
if (group == 1) {
param.sparse = param::ConvBias::Sparse::DENSE;
......@@ -149,6 +151,16 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
nonlinemode.emplace_back(NLMode::RELU);
nonlinemode.emplace_back(NLMode::H_SWISH);
}
std::vector<megdnn::BiasMode> bias_mode = {
megdnn::BiasMode::BROADCAST_CHANNEL_BIAS};
if (no_bias) {
bias_mode.emplace_back(megdnn::BiasMode::NO_BIAS);
}
if (support_full_bias) {
bias_mode.emplace_back(megdnn::BiasMode::BIAS);
}
for (auto bias : bias_mode)
for (auto nlmode : nonlinemode)
for (size_t n : {1, 2})
for (size_t kernel : kernel_vec)
......@@ -158,8 +170,8 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
for (size_t w : {7, 16, 23}) {
for (size_t group = 1;
group <= std::min(oc, ic); ++group) {
pack(n, oc, ic, h, w, kernel, stride, group,
nlmode);
pack(n, oc, ic, h, w, kernel, stride,
group, nlmode, bias);
}
}
return args;
......@@ -325,6 +337,13 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_SMALL_GROUP) {
get_conv_bias_args({1, 2, 3, 4, 5, 6, 7}, 1, false, false, false),
handle(), "F32DIRECT_SMALL_GROUP");
}
// Correctness of the fp32 NCHW44 direct stride-2 algo for filter sizes
// 2/3/5/7; the final `true` enables full (per-element) bias cases.
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_NCHW44_S2) {
    check_conv_bias(get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false,
                                              false, false, true),
                    handle(), "F32_CONV_NCHW44_DIRECT_S2");
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_STR1_LARGE_GROUP) {
check_conv_bias(get_conv_bias_args({2, 3, 5, 7}, 1, false, false, false),
handle(), "F32STRD1_LARGE_GROUP");
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册