提交 ad3c9315 编写于 作者: M Megvii Engine Team

feat(dnn/arm): add arm nchw44 fp32 pooling

GitOrigin-RevId: 6a26dad0a1dd2963b1b8f4d6c746dfcb58d911b5
上级 d19a6379
#pragma once
/**
* \file dnn/src/arm_common/conv_bias/intrinsic_helper.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
......@@ -10,7 +9,9 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/neon_struct.h"
#pragma once
#include "src/arm_common/intrinsic_helper.h"
#include "src/arm_common/neon_struct.h"
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/common/unroll_macro.h"
#include "src/fallback/conv_bias/common.h"
......@@ -689,185 +690,8 @@ inline void init_ocx_ow4(T& c, const int32_t* bias_ptr, int oc_step) {
InitOcxOw4<c_dim, bias_mode, T>::impl(c, bias_ptr, oc_step);
}
///////////////////////////////////////
template <int weight_number, int base_offset, int ptr_step, int oc_block,
typename Func, typename T, typename T2, typename... XT>
struct LoadHelper {
static void impl(T& weight, T2 ptr, int oc_offset, XT... args);
};
#define WEIGHT_CB(step) \
src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...);
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
typename... XT>
struct LoadHelper<1, base_offset, ptr_step, 0, Func, T, T2, XT...> {
static void impl(T& src, T2 ptr, int, XT... args) {
UNROLL_CALL_RAW(1, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
typename... XT>
struct LoadHelper<2, base_offset, ptr_step, 0, Func, T, T2, XT...> {
static void impl(T& src, T2 ptr, int, XT... args) {
UNROLL_CALL_RAW(2, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
typename... XT>
struct LoadHelper<3, base_offset, ptr_step, 0, Func, T, T2, XT...> {
static void impl(T& src, T2 ptr, int, XT... args) {
UNROLL_CALL_RAW(3, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
typename... XT>
struct LoadHelper<4, base_offset, ptr_step, 0, Func, T, T2, XT...> {
static void impl(T& src, T2 ptr, int, XT... args) {
UNROLL_CALL_RAW(4, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
typename... XT>
struct LoadHelper<5, base_offset, ptr_step, 0, Func, T, T2, XT...> {
static void impl(T& src, T2 ptr, int, XT... args) {
UNROLL_CALL_RAW(5, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
typename... XT>
struct LoadHelper<6, base_offset, ptr_step, 0, Func, T, T2, XT...> {
static void impl(T& src, T2 ptr, int, XT... args) {
UNROLL_CALL_RAW(6, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
typename... XT>
struct LoadHelper<7, base_offset, ptr_step, 0, Func, T, T2, XT...> {
static void impl(T& src, T2 ptr, int, XT... args) {
UNROLL_CALL_RAW(7, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
typename... XT>
struct LoadHelper<8, base_offset, ptr_step, 0, Func, T, T2, XT...> {
static void impl(T& src, T2 ptr, int, XT... args) {
UNROLL_CALL_RAW(8, WEIGHT_CB);
}
};
#undef WEIGHT_CB
#define WEIGHT_CB(step) \
src[0][step] = Func::impl(ptr + base_offset + step * ptr_step);
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<1, base_offset, ptr_step, 1, Func, T, T2> {
static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(1, WEIGHT_CB); }
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<2, base_offset, ptr_step, 1, Func, T, T2> {
static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(2, WEIGHT_CB); }
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<3, base_offset, ptr_step, 1, Func, T, T2> {
static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(3, WEIGHT_CB); }
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<4, base_offset, ptr_step, 1, Func, T, T2> {
static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(4, WEIGHT_CB); }
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<5, base_offset, ptr_step, 1, Func, T, T2> {
static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(5, WEIGHT_CB); }
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<6, base_offset, ptr_step, 1, Func, T, T2> {
static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(6, WEIGHT_CB); }
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<7, base_offset, ptr_step, 1, Func, T, T2> {
static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(7, WEIGHT_CB); }
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<8, base_offset, ptr_step, 1, Func, T, T2> {
static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(8, WEIGHT_CB); }
};
#undef WEIGHT_CB
#define WEIGHT_CB(step) \
src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); \
src[1][step] = Func::impl(ptr + base_offset + step * ptr_step + oc_offset);
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<1, base_offset, ptr_step, 2, Func, T, T2> {
static void impl(T& src, T2 ptr, int oc_offset) {
UNROLL_CALL_RAW(1, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<2, base_offset, ptr_step, 2, Func, T, T2> {
static void impl(T& src, T2 ptr, int oc_offset) {
UNROLL_CALL_RAW(2, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<3, base_offset, ptr_step, 2, Func, T, T2> {
static void impl(T& src, T2 ptr, int oc_offset) {
UNROLL_CALL_RAW(3, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<4, base_offset, ptr_step, 2, Func, T, T2> {
static void impl(T& src, T2 ptr, int oc_offset) {
UNROLL_CALL_RAW(4, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<5, base_offset, ptr_step, 2, Func, T, T2> {
static void impl(T& src, T2 ptr, int oc_offset) {
UNROLL_CALL_RAW(5, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<6, base_offset, ptr_step, 2, Func, T, T2> {
static void impl(T& src, T2 ptr, int oc_offset) {
UNROLL_CALL_RAW(6, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<7, base_offset, ptr_step, 2, Func, T, T2> {
static void impl(T& src, T2 ptr, int oc_offset) {
UNROLL_CALL_RAW(7, WEIGHT_CB);
}
};
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<8, base_offset, ptr_step, 2, Func, T, T2> {
static void impl(T& src, T2 ptr, int oc_offset) {
UNROLL_CALL_RAW(8, WEIGHT_CB);
}
};
#undef WEIGHT_CB
template <int weight_number, int base_offset, int ptr_step, int c_dim,
typename Func, typename T, typename T2>
inline void load_helper(T& weight, T2 ptr, int oc_offset) {
LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2>::impl(
weight, ptr, oc_offset);
}
template <int weight_number, int base_offset, int ptr_step, int c_dim,
typename Func, typename T, typename T2, typename... XT>
inline void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) {
LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2,
XT...>::impl(weight, ptr, oc_offset, args...);
}
} // namespace
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/arm_common/intrinsic_helper.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/arm_common/neon_struct.h"
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/common/unroll_macro.h"
namespace megdnn {
namespace {
template <int weight_number, int base_offset, int ptr_step, int oc_block,
typename Func, typename T, typename T2, typename... XT>
struct LoadHelper {
static void impl(T& weight, T2 ptr, int oc_offset, XT... args);
};
#define WEIGHT_CB(step) \
src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...);
#define LOAD_HELPER(step) \
template <int base_offset, int ptr_step, typename Func, typename T, \
typename T2, typename... XT> \
struct LoadHelper<step, base_offset, ptr_step, 0, Func, T, T2, XT...> { \
static void impl(T& src, T2 ptr, int, XT... args) { \
UNROLL_CALL_RAW(step, WEIGHT_CB); \
} \
}
LOAD_HELPER(1);
LOAD_HELPER(2);
LOAD_HELPER(3);
LOAD_HELPER(4);
LOAD_HELPER(5);
LOAD_HELPER(6);
LOAD_HELPER(7);
LOAD_HELPER(8);
LOAD_HELPER(9);
LOAD_HELPER(10);
LOAD_HELPER(11);
LOAD_HELPER(12);
LOAD_HELPER(13);
LOAD_HELPER(14);
LOAD_HELPER(15);
LOAD_HELPER(16);
#undef LOAD_HELPER
#undef WEIGHT_CB
///////////////////////////c_dim = 1/////////////////////////
#define WEIGHT_CB(step) \
src[0][step] = Func::impl(ptr + base_offset + step * ptr_step);
#define LOAD_HELPER(step) \
template <int base_offset, int ptr_step, typename Func, typename T, \
typename T2> \
struct LoadHelper<step, base_offset, ptr_step, 1, Func, T, T2> { \
static void impl(T& src, T2 ptr, int) { \
UNROLL_CALL_RAW(step, WEIGHT_CB); \
} \
}
LOAD_HELPER(1);
LOAD_HELPER(2);
LOAD_HELPER(3);
LOAD_HELPER(4);
LOAD_HELPER(5);
LOAD_HELPER(6);
LOAD_HELPER(7);
LOAD_HELPER(8);
LOAD_HELPER(9);
#undef LOAD_HELPER
#undef WEIGHT_CB
/////////////////////////c_dim = 2///////////////////////////////
#define WEIGHT_CB(step) \
src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); \
src[1][step] = Func::impl(ptr + base_offset + step * ptr_step + oc_offset);
#define LOAD_HELPER(step) \
template <int base_offset, int ptr_step, typename Func, typename T, \
typename T2> \
struct LoadHelper<step, base_offset, ptr_step, 2, Func, T, T2> { \
static void impl(T& src, T2 ptr, int oc_offset) { \
UNROLL_CALL_RAW(step, WEIGHT_CB); \
} \
}
LOAD_HELPER(1);
LOAD_HELPER(2);
LOAD_HELPER(3);
LOAD_HELPER(4);
LOAD_HELPER(5);
LOAD_HELPER(6);
LOAD_HELPER(7);
LOAD_HELPER(8);
#undef LOAD_HELPER
#undef WEIGHT_CB
template <int weight_number, int base_offset, int ptr_step, int c_dim,
typename Func, typename T, typename T2>
inline void load_helper(T& weight, T2 ptr, int oc_offset) {
LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2>::impl(
weight, ptr, oc_offset);
}
template <int weight_number, int base_offset, int ptr_step, int c_dim,
typename Func, typename T, typename T2, typename... XT>
inline void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) {
LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2,
XT...>::impl(weight, ptr, oc_offset, args...);
}
} // namespace
} // namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
#pragma once
/**
* \file dnn/src/arm_common/conv_bias/neon_struct.h
* \file dnn/src/arm_common/neon_struct.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
......@@ -10,6 +9,7 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/arm_common/simd_macro/marm_neon.h"
namespace megdnn {
namespace {
......@@ -62,4 +62,6 @@ struct Vfmaq_laneq_f32 {
};
} // namespace
} // namespace megdnn
\ No newline at end of file
} // namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
......@@ -114,7 +114,13 @@ public:
bool usable(const PoolingKernSizeParam& param) const override;
void exec(const PoolingKernParam& param) const override;
};
class PoolingImpl::AlgoFp32ModexStridexNCHW44 final : public AlgoBase {
public:
bool is_reproducible() const override { return true; }
const char* name() const override { return "ARM_POOLING_FP32_MODEX_STRIDEX_NCHW44"; }
bool usable(const PoolingKernSizeParam& param) const override;
void exec(const PoolingKernParam& param) const override;
};
WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam& param);
WorkspaceBundle get_bundle_nchw44(
......
/**
* \file dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/opr_param_defs.h"
#include "src/arm_common/pooling/algo.h"
#include "src/arm_common/pooling/kern_fp32_pooling_nchw44.h"
#include "midout.h"
MIDOUT_DECL(megdnn_arm_common_fp32_pooling_nchw44)
namespace megdnn {
namespace arm_common {
bool PoolingImpl::AlgoFp32ModexStridexNCHW44::usable(
const PoolingKernSizeParam& param) const {
uint32_t sh = param.stride[0];
uint32_t sw = param.stride[1];
uint32_t fh = param.filter[0];
uint32_t fw = param.filter[1];
bool avaible = param.src_type.enumv() == DTypeEnum::Float32 &&
param.format == Param::Format::NCHW44 &&
(param.mode == Mode::MAX || param.mode == Mode::AVERAGE) &&
fh == fw && sh == sw &&
(fh == 2 || fh == 3 || fh == 4 || fh == 5) &&
(sh == 1 || sh == 2);
return avaible;
}
void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec(
const PoolingKernParam& param) const {
int ih = param.isz[0];
int iw = param.isz[1];
int oh = param.osz[0];
int ow = param.osz[1];
int n = param.n;
int ic = param.ic;
int ph = param.padding[0];
int pw = param.padding[1];
int sh = param.stride[0];
int fh = param.filter[0];
void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
#define DISPATCH_FUNC(filter, stride, mode) \
MIDOUT_BEGIN(megdnn_arm_common_fp32_pooling_nchw44, midout_iv(0), \
midout_iv(#filter #stride #mode##_hash)) { \
auto run = [ih, iw, oh, ow, ph, pw, src_ptr, dst_ptr](size_t index, \
size_t) { \
const int c_idx = index; \
pooling_fp32_nchw44<filter, stride, mode>( \
static_cast<const float*>(src_ptr) + c_idx * ih * iw * 4, \
static_cast<float*>(dst_ptr) + c_idx * oh * ow * 4, ih, \
iw, oh, ow, ph, pw); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \
static_cast<::megdnn::naive::HandleImpl*>(param.handle), \
n* ic, run); \
} \
MIDOUT_END();
#define DISPATCH_MODE(filter, stride) \
switch (param.mode) { \
case PoolingBase::Mode::MAX: \
DISPATCH_FUNC(filter, stride, PoolingBase::Mode::MAX); \
break; \
case PoolingBase::Mode::AVERAGE: \
DISPATCH_FUNC(filter, stride, PoolingBase::Mode::AVERAGE); \
break; \
default: \
megdnn_assert(0, "invalid mode %u", \
static_cast<uint32_t>(param.mode)); \
}
#define DISPATCH_STRIDE(filter) \
switch (sh) { \
case 1: \
DISPATCH_MODE(filter, 1); \
break; \
case 2: \
DISPATCH_MODE(filter, 2); \
break; \
default: \
megdnn_assert(0, "invalid stride %d", sh); \
}
#define DISPATCH_FILTER() \
switch (fh) { \
case 2: \
DISPATCH_STRIDE(2); \
break; \
case 3: \
DISPATCH_STRIDE(3); \
break; \
case 4: \
DISPATCH_STRIDE(4); \
break; \
case 5: \
DISPATCH_STRIDE(5); \
break; \
default: \
megdnn_assert(0, "invalid filter %d", fh); \
}
DISPATCH_FILTER()
#undef DISPATCH_FILTER
#undef DISPATCH_STRIDE
#undef DISPATCH_MODE
#undef DISPATCH_FUNC
}
} // namespace arm_common
} // namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
/**
* \file dnn/src/arm_common/pooling/kern_fp32_pooling_nchw44.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <limits>
#include "megdnn/opr_param_defs.h"
#include "src/arm_common/intrinsic_helper.h"
#include "src/arm_common/neon_struct.h"
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/common/unroll_macro.h"
namespace megdnn {
namespace arm_common {
namespace {
template <int filter, int stride, int ow_step, PoolingBase::Mode mode,
typename T1, typename T2>
struct CalXsXNchw44 {
static void impl(T1 result, T2 src);
};
template <int filter, int stride, int ow_step, PoolingBase::Mode mode,
typename T1, typename T2>
void calculate_xsx_nchw44(T1 result, T2 src) {
CalXsXNchw44<filter, stride, ow_step, mode, T1, T2>::impl(result, src);
};
#define CALCULATE_MAX_CB(step) \
result[0] = vmaxq_f32(result[0], src[0 * stride + step]); \
result[1] = vmaxq_f32(result[1], src[1 * stride + step]); \
result[2] = vmaxq_f32(result[2], src[2 * stride + step]); \
result[3] = vmaxq_f32(result[3], src[3 * stride + step]);
#define CALCULATE_AVG_CB(step) \
result[0] = vaddq_f32(result[0], src[0 * stride + step]); \
result[1] = vaddq_f32(result[1], src[1 * stride + step]); \
result[2] = vaddq_f32(result[2], src[2 * stride + step]); \
result[3] = vaddq_f32(result[3], src[3 * stride + step]);
#define INSTANCE_CAL(filter) \
template <int stride, typename T1, typename T2> \
struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::MAX, T1, T2> { \
static void impl(T1 result, T2 src) { \
UNROLL_CALL_RAW(filter, CALCULATE_MAX_CB); \
} \
}; \
template <int stride, typename T1, typename T2> \
struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::AVERAGE, T1, \
T2> { \
static void impl(T1 result, T2 src) { \
UNROLL_CALL_RAW(filter, CALCULATE_AVG_CB); \
} \
};
INSTANCE_CAL(2)
INSTANCE_CAL(3)
INSTANCE_CAL(4)
INSTANCE_CAL(5)
#undef INSTANCE_CAL
#undef CALCULATE_AVG_CB
#undef CALCULATE_MAX_CB
template <int filter, int stride, int ow_step, PoolingBase::Mode mode>
struct KerPoolingFilterXStrideXNchw44 {
static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw);
};
template <int filter, int stride, int ow_step>
struct KerPoolingFilterXStrideXNchw44<filter, stride, ow_step,
PoolingBase::Mode::MAX> {
static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) {
constexpr int src_reg_size = ow_step * stride + filter - stride;
constexpr int packed_ic = 4;
constexpr int simd_len = 4;
constexpr float default_float = std::numeric_limits<float>::lowest();
float32x4_t result[ow_step];
float32x4_t src[src_reg_size];
result[0] = vdupq_n_f32(default_float);
result[1] = vdupq_n_f32(default_float);
result[2] = vdupq_n_f32(default_float);
result[3] = vdupq_n_f32(default_float);
for (int fh_idx = 0; fh_idx < filter; ++fh_idx) {
load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(
src, src_ptr + fh_idx * iw * packed_ic, 0);
calculate_xsx_nchw44<filter, stride, ow_step,
PoolingBase::Mode::MAX>(result, src);
}
vst1q_f32(dst_ptr + 0 * packed_ic, result[0]);
vst1q_f32(dst_ptr + 1 * packed_ic, result[1]);
vst1q_f32(dst_ptr + 2 * packed_ic, result[2]);
vst1q_f32(dst_ptr + 3 * packed_ic, result[3]);
}
};
template <int filter, int stride, int ow_step>
struct KerPoolingFilterXStrideXNchw44<filter, stride, ow_step,
PoolingBase::Mode::AVERAGE> {
static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) {
constexpr int src_reg_size = ow_step * stride + filter - stride;
constexpr int packed_ic = 4;
constexpr int simd_len = 4;
constexpr float default_float = 0;
constexpr float div_filter_size = 1.f / (filter * filter);
const float32x4_t div_filter_size_vec = vdupq_n_f32(div_filter_size);
float32x4_t result[ow_step];
float32x4_t src[src_reg_size];
result[0] = vdupq_n_f32(default_float);
result[1] = vdupq_n_f32(default_float);
result[2] = vdupq_n_f32(default_float);
result[3] = vdupq_n_f32(default_float);
for (int fh_idx = 0; fh_idx < filter; ++fh_idx) {
load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(
src, src_ptr + fh_idx * iw * packed_ic, 0);
calculate_xsx_nchw44<filter, stride, ow_step,
PoolingBase::Mode::AVERAGE>(result, src);
}
result[0] = vmulq_f32(result[0], div_filter_size_vec);
result[1] = vmulq_f32(result[1], div_filter_size_vec);
result[2] = vmulq_f32(result[2], div_filter_size_vec);
result[3] = vmulq_f32(result[3], div_filter_size_vec);
vst1q_f32(dst_ptr + 0 * packed_ic, result[0]);
vst1q_f32(dst_ptr + 1 * packed_ic, result[1]);
vst1q_f32(dst_ptr + 2 * packed_ic, result[2]);
vst1q_f32(dst_ptr + 3 * packed_ic, result[3]);
}
};
template <PoolingBase::Mode mode>
void ker_pooling_nchw44_remain_pad(const float32_t* src_ptr, float32_t* dst_ptr,
const int iw, const int pad_top,
const int pad_bottom, const int pad_left,
const int pad_right, const int filter);
template <>
void ker_pooling_nchw44_remain_pad<PoolingBase::Mode::MAX>(
const float32_t* src_ptr, float32_t* dst_ptr, const int iw,
const int pad_top, const int pad_bottom, const int pad_left,
const int pad_right, const int filter) {
constexpr int ic_step = 4;
const int ih_end = filter - pad_bottom;
const int iw_end = filter - pad_right;
float32x4_t result = vdupq_n_f32(std::numeric_limits<float>::lowest());
for (int ih_idx = pad_top; ih_idx < ih_end; ++ih_idx) {
for (int iw_idx = pad_left; iw_idx < iw_end; ++iw_idx) {
float32x4_t src =
vld1q_f32(src_ptr + (iw_idx - pad_left) * ic_step);
result = vmaxq_f32(result, src);
}
src_ptr += iw * ic_step;
}
vst1q_f32(dst_ptr, result);
}
template <>
void ker_pooling_nchw44_remain_pad<PoolingBase::Mode::AVERAGE>(
const float32_t* src_ptr, float32_t* dst_ptr, const int iw,
const int pad_top, const int pad_bottom, const int pad_left,
const int pad_right, const int filter) {
constexpr int ic_step = 4;
const int ih_end = filter - pad_bottom;
const int iw_end = filter - pad_right;
const float div_filter_size = 1.f / (filter * filter);
const float32x4_t div_filter_size_vec = vdupq_n_f32(div_filter_size);
float32x4_t result = vdupq_n_f32(0.f);
for (int ih_idx = pad_top; ih_idx < ih_end; ++ih_idx) {
for (int iw_idx = pad_left; iw_idx < iw_end; ++iw_idx) {
float32x4_t src =
vld1q_f32(src_ptr + (iw_idx - pad_left) * ic_step);
result = vaddq_f32(result, src);
}
src_ptr += iw * ic_step;
}
result = vmulq_f32(result, div_filter_size_vec);
vst1q_f32(dst_ptr, result);
}
template <PoolingBase::Mode mode>
static inline void kern_pooling_with_pad_nchw44(
const float32_t* src, float32_t* dst, const int filter,
const int ow_start, const int ow_end, const int iw, const int ow,
const int stride_w, const int pw, const int real_ih_idx,
const int oh_idx, const int pad_top, const int pad_bottom) {
constexpr int ic_step = 4;
constexpr int oc_step = 4;
for (int ow_idx = ow_start; ow_idx < ow_end; ++ow_idx) {
const int iw_idx = ow_idx * stride_w;
const int real_iw_idx = std::max(iw_idx - pw, 0);
const int pad_left = std::max(0, pw - iw_idx);
const int pad_right = std::max(0, iw_idx - pw + filter - iw);
const int src_offset = (real_ih_idx * iw + real_iw_idx) * ic_step;
const int dst_offset = (oh_idx * ow + ow_idx) * oc_step;
ker_pooling_nchw44_remain_pad<mode>(src + src_offset, dst + dst_offset,
iw, pad_top, pad_bottom, pad_left,
pad_right, filter);
}
}
template <int filter, int stride, PoolingBase::Mode mode>
static inline void pooling_fp32_nchw44_pad(const float32_t* src, float32_t* dst,
int ih, int iw, int oh, int ow,
int ph, int pw) {
constexpr int stride_h = stride;
constexpr int stride_w = stride;
constexpr int ic_step = 4;
constexpr int oc_step = 4;
constexpr int ow_step = 4;
const int ow_pad_left_end = div_ceil(pw, stride_w);
const int ow_pad_right_end = (iw - filter + pw - 1) / stride_w;
const int ow_pad_right_step_end =
(ow_pad_right_end - ow_pad_left_end) / ow_step * ow_step +
ow_pad_left_end;
rep(oh_idx, oh) {
const int ih_idx = oh_idx * stride_h;
const int real_ih_idx = std::max(ih_idx - ph, 0);
const int pad_top = std::max(0, ph - ih_idx);
const int pad_bottom = std::max(0, ih_idx - ph + filter - ih);
if (pad_top > 0 || pad_bottom > 0) {
kern_pooling_with_pad_nchw44<mode>(src, dst, filter, 0, ow, iw, ow,
stride_w, pw, real_ih_idx,
oh_idx, pad_top, pad_bottom);
} else {
kern_pooling_with_pad_nchw44<mode>(
src, dst, filter, 0, ow_pad_left_end, iw, ow, stride_w, pw,
real_ih_idx, oh_idx, pad_top, pad_bottom);
for (int ow_idx = ow_pad_left_end; ow_idx < ow_pad_right_step_end;
ow_idx += ow_step) {
const int iw_idx = ow_idx * stride_w;
const int real_iw_idx = std::max(iw_idx - pw, 0);
const int src_offset =
(real_ih_idx * iw + real_iw_idx) * ic_step;
const int dst_offset = (oh_idx * ow + ow_idx) * oc_step;
KerPoolingFilterXStrideXNchw44<filter, stride, ow_step,
mode>::impl(src + src_offset,
dst + dst_offset,
iw);
}
kern_pooling_with_pad_nchw44<mode>(
src, dst, filter, ow_pad_right_step_end, ow, iw, ow,
stride_w, pw, real_ih_idx, oh_idx, pad_top, pad_bottom);
}
}
}
template <int filter, int stride, PoolingBase::Mode mode>
static inline void pooling_fp32_nchw44_no_pad(const float32_t* src,
float32_t* dst, int, int iw,
int oh, int ow) {
constexpr int stride_h = stride;
constexpr int stride_w = stride;
constexpr int ic_step = 4;
constexpr int oc_step = 4;
constexpr int ow_step = 4;
const int ow_end = ow / ow_step * ow_step;
const int ow_remain = ow - ow_end;
rep(oh_idx, oh) {
const int ih_idx = oh_idx * stride_h;
const int src_ih_offset = ih_idx * iw;
const int dst_oh_offset = oh_idx * ow;
for (int ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) {
const int iw_idx = ow_idx * stride_w;
const int src_offset = (src_ih_offset + iw_idx) * ic_step;
const int dst_offset = (dst_oh_offset + ow_idx) * oc_step;
KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, mode>::impl(
src + src_offset, dst + dst_offset, iw);
}
if (ow_remain > 0) {
kern_pooling_with_pad_nchw44<mode>(src, dst, filter, ow_end, ow, iw,
ow, stride_w, 0, ih_idx, oh_idx,
0, 0);
}
}
}
template <int filter, int stride, PoolingBase::Mode mode>
static inline void pooling_fp32_nchw44(const float32_t* src, float32_t* dst,
int ih, int iw, int oh, int ow, int ph,
int pw) {
if (ph > 0 || pw > 0) {
pooling_fp32_nchw44_pad<filter, stride, mode>(src, dst, ih, iw, oh, ow,
ph, pw);
} else {
pooling_fp32_nchw44_no_pad<filter, stride, mode>(src, dst, ih, iw, oh,
ow);
}
}
} // namespace
} // namespace arm_common
} // namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
......@@ -29,6 +29,7 @@ class PoolingImpl::AlgoPack : NonCopyableObj {
AlgoFilter3ModexStridexNCHW44 algo_filter3_modex_stridex_nchw4;
AlgoFilter4ModexStridexNCHW44 algo_filter4_modex_stridex_nchw4;
AlgoFilter5ModexStridexNCHW44 algo_filter5_modex_stridex_nchw4;
AlgoFp32ModexStridexNCHW44 algo_fp32_modex_stridex_nchw44;
public:
AlgoPack() {
......@@ -44,6 +45,7 @@ public:
all_algos.emplace_back(&algo_filter2_modex_stridex_nchw4);
all_algos.emplace_back(&algo_filter4_modex_stridex_nchw4);
all_algos.emplace_back(&algo_filter5_modex_stridex_nchw4);
all_algos.emplace_back(&algo_fp32_modex_stridex_nchw44);
}
SmallVector<AlgoBase*> all_algos;
};
......
......@@ -87,10 +87,10 @@ private:
class AlgoFilter3ModexStridexNCHW44;
class AlgoFilter4ModexStridexNCHW44;
class AlgoFilter5ModexStridexNCHW44;
class AlgoFp32ModexStridexNCHW44;
class AlgoPack;
};
} // namespace arm_common
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
......@@ -32,10 +33,33 @@
#define UNROLL_RAW9(cb, v0, a...) \
UNROLL_RAW8(cb, v0, ##a) \
cb(8, ##a)
#define UNROLL_RAW10(cb, v0, a...) \
UNROLL_RAW9(cb, v0, ##a) \
cb(9, ##a)
#define UNROLL_RAW11(cb, v0, a...) \
UNROLL_RAW10(cb, v0, ##a) \
cb(10, ##a)
#define UNROLL_RAW12(cb, v0, a...) \
UNROLL_RAW11(cb, v0, ##a) \
cb(11, ##a)
#define UNROLL_RAW13(cb, v0, a...) \
UNROLL_RAW12(cb, v0, ##a) \
cb(12, ##a)
#define UNROLL_RAW14(cb, v0, a...) \
UNROLL_RAW13(cb, v0, ##a) \
cb(13, ##a)
#define UNROLL_RAW15(cb, v0, a...) \
UNROLL_RAW14(cb, v0, ##a) \
cb(14, ##a)
// clang-format off
#define UNROLL_RAW16(cb, v0, a...) \
UNROLL_RAW8(cb, v0, ##a) \
cb(8, ##a) cb(9, ##a) cb(10, ##a) cb(11, ##a) cb(12, ##a) cb(13, ##a) \
cb(14, ##a) cb(15, ##a)
#define UNROLL_RAW17(cb, v0, a...) \
UNROLL_RAW16(cb, v0, ##a) \
cb(16, ##a)
#define UNROLL_RAW24(cb, v0, a...) \
UNROLL_RAW16(cb, v0, ##a) \
cb(16, ##a) cb(17, ##a) cb(18, ##a) cb(19, ##a) cb(20, ##a) cb(21, ##a) \
......@@ -130,4 +154,6 @@
#define UNROLL_CALL_NOWRAPPER_D2(step, step2, cb) \
UNROLL_CALL_RAW_D2(step, step2, cb)
// clang-format on
// vim: syntax=cpp.doxygen
......@@ -256,6 +256,73 @@ TEST_F(ARM_COMMON, POOLING_QUANTIZED) {
}
#if MEGDNN_WITH_BENCHMARK
void benchmark_nchw44_fp32(Handle *handle) {
using Param = param::Pooling;
auto run = [&](size_t n, size_t c, size_t h, size_t w, size_t filter,
size_t stride, size_t pad, Param::Mode mode) {
Param param;
param.window_h = param.window_w = filter;
param.stride_h = param.stride_w = stride;
param.pad_h = param.pad_w = pad;
param.format = Param::Format::NCHW;
param.mode = mode;
TensorShape nchw_shape = {n, c, h, w};
TensorShape nchw44_shape = {n, c / 4, h, w, 4};
TensorLayout dst_layout;
auto opr = handle->create_operator<Pooling>();
opr->param() = param;
opr->deduce_layout({nchw_shape, dtype::Float32()}, dst_layout);
float calc_amount =
dst_layout.total_nr_elems() * param.window_h * param.window_w;
Benchmarker<Pooling> benchmarker_float_nchw(handle);
Benchmarker<Pooling> benchmarker_float_nchw44(handle);
Benchmarker<Pooling> benchmarker_int_nchw44(handle);
size_t RUN = 500;
auto t1 = benchmarker_float_nchw.set_display(false)
.set_times(RUN)
.set_param(param)
.exec({nchw_shape, {}});
param.format = Param::Format::NCHW44;
auto t2 = benchmarker_int_nchw44.set_display(false)
.set_times(RUN)
.set_param(param)
.execl({{nchw44_shape, dtype::QuantizedS8(1.0)},
{{}, dtype::QuantizedS8(1.0)}});
auto t3 = benchmarker_float_nchw44.set_display(false)
.set_times(RUN)
.set_param(param)
.exec({nchw44_shape, {}});
printf("{%zu %zu %zu %zu} filter = %zu, stride = %zu pad = %zu\n"
"nchw_fp32={%.3f ms, %.3f Mflops}, "
"nchw44_int={%.3f ms, %.3f Mflops}, "
"nchw44_fp32={%.3f ms, %.3f Mflops, speed_up %f}\n\n",
n, c, h, w, filter, stride, pad, t1 / RUN,
calc_amount / (t1 / RUN * 1000), t2 / RUN,
calc_amount / (t2 / RUN * 1000), t3 / RUN,
calc_amount / (t3 / RUN * 1000), t1 / t3);
};
// Resnet50
run(1, 64, 112, 112, 3, 2, 1, param::Pooling::Mode::MAX);
run(1, 2048, 7, 7, 7, 1, 0, param::Pooling::Mode::AVERAGE);
// VGG16
run(1, 64, 224, 224, 2, 2, 0, param::Pooling::Mode::MAX);
run(1, 128, 112, 112, 2, 2, 0, param::Pooling::Mode::MAX);
run(1, 256, 56, 56, 2, 2, 0, param::Pooling::Mode::MAX);
run(1, 512, 28, 28, 2, 2, 0, param::Pooling::Mode::MAX);
run(1, 512, 14, 14, 2, 2, 0, param::Pooling::Mode::MAX);
}
TEST_F(ARM_COMMON, BENCHMARK_POOLING_NCHW44_FP32) { benchmark_nchw44_fp32(handle()); }
TEST_F(ARM_COMMON_MULTI_THREADS, BENCHMARK_POOLING_NCHW44_FP32) {
benchmark_nchw44_fp32(handle());
}
TEST_F(ARM_COMMON, BENCHMARK_POOLING_INT8_W3x3_S2x2)
{
using Param = param::Pooling;
......
......@@ -57,6 +57,65 @@ TEST_F(ARM_COMMON_MULTI_THREADS, POOLING) {
}
}
std::vector<std::pair<param::Pooling, TensorShapeArray>> get_nchw44_pool_args(
size_t filter, size_t stride) {
constexpr size_t ic_step = 4;
std::vector<std::pair<param::Pooling, TensorShapeArray>> args;
for (size_t n : {1, 2})
for (size_t c : {4, 8})
for (size_t ih : {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13})
for (size_t iw : {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13})
for (size_t ph : {0, 1, 2})
for (size_t pw : {0, 1, 2})
for (auto mode : {param::Pooling::Mode::MAX,
param::Pooling::Mode::AVERAGE})
if (ih + 2 * ph >= filter &&
iw + 2 * pw >= filter && filter > ph &&
filter > pw) {
param::Pooling param;
param.mode = mode;
param.format =
param::Pooling::Format::NCHW44;
param.pad_h = ph;
param.pad_w = pw;
param.stride_h = param.stride_w = stride;
param.window_h = param.window_w = filter;
args.emplace_back(std::make_pair(
param,
TensorShapeArray{{n, c / ic_step,
ih, iw, ic_step},
{}}));
}
return args;
}
void run_pooling_check(
Handle* handle,
std::vector<std::pair<param::Pooling, TensorShapeArray>> args,
bool is_int8) {
Checker<Pooling> checker(handle);
UniformIntRNG rng_int8{INT8_MIN >> 1, INT8_MAX >> 1};
UniformIntRNG rng_fp32{-10, 10};
if (is_int8) {
checker.set_dtype(0, dtype::QuantizedS8(1.1f));
checker.set_rng(0, &rng_int8);
} else {
checker.set_rng(0, &rng_fp32);
}
for (auto arg : args) {
checker.set_param(arg.first).exec(arg.second);
}
}
TEST_F(ARM_COMMON_MULTI_THREADS, POOLING_NCHW44_FP32) {
for (auto filter : {2, 3, 4, 5})
for (auto stride : {1, 2}) {
run_pooling_check(handle(), get_nchw44_pool_args(filter, stride),
false);
}
}
TEST_F(ARM_COMMON_MULTI_THREADS, POOLING_W3x3_NCHW44)
{
// clang-format off
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册