Commit fa59a7b0 authored by: Megvii Engine Team

feat(x86/rvv): opt AlgoF32DirectNCHWNCHW44

and opt GiMaximumFloat32/GiMinimumFloat32 on x86

GitOrigin-RevId: 825021e867ccf6314aac5a6c4c61df13d1e71705
Parent 0d82e9b7
......@@ -748,7 +748,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5(
GI_FLOAT32_FIXLEN_t src_v[2][5];
#define COMPUTE_5X5_2(i, dst, src, kernel0, kernel1) \
load_vec<5>(kernel0, filter + i * 5 * 4); \
load_vec<6>(src, input + i * IW * 4); \
load_vec<5>(src, input + i * IW * 4); \
compute_vec<5>(dst[0][0], &src[0], kernel0); \
compute_vec<5>(dst[1][0], &src[0], kernel1);
// line 0
......@@ -813,7 +813,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5(
GI_FLOAT32_FIXLEN_t src_v[2][5];
#define COMPUTE_5X5_1(i, dst, src, kernel) \
load_vec<5>(kernel, filter + i * 5 * 4); \
load_vec<6>(src, input + i * IW * 4); \
load_vec<5>(src, input + i * IW * 4); \
compute_vec<5>(dst, &src[0], kernel)
// line 0
COMPUTE_5X5_1(0, dst_v, src_v[0], kernel[0]);
......@@ -1148,7 +1148,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_5x5(
GI_FLOAT32_FIXLEN_t src_v[2][5];
#define COMPUTE_5X5_1(i, dst, src, kernel) \
load_vec<5>(kernel, filter + i * 5 * 4); \
load_vec<6>(src, input + i * IW * 4); \
load_vec<5>(src, input + i * IW * 4); \
compute_vec<5>(dst, &src[0], kernel)
// line 0
COMPUTE_5X5_1(0, dst_v, src_v[0], kernel[0]);
......
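All three hunks above make the same correction: each 5x5 row feeds compute_vec<5>, which consumes exactly five GI vectors, so the load count is brought down from six to five and the extra load, which writes past the five slots the row buffer provides, is dropped. A minimal sketch of the mismatch, with load_vec reduced to a hypothetical stand-in for the real unrolled helper:

// Hypothetical stand-in for the real load_vec helper, only to show the element count.
template <size_t N>
void load_vec(GI_FLOAT32_FIXLEN_t* dst, const float* src) {
    for (size_t i = 0; i < N; ++i)  // the real helper is fully unrolled
        dst[i] = GiFloat32Type2FixLenType(GiLoadFloat32(src + i * 4));
}

void demo_row_load(const float* input) {
    GI_FLOAT32_FIXLEN_t src_v[2][5];  // as declared in the kernels above
    load_vec<6>(src_v[0], input);     // old: one vector more than needed, spills past src_v[0][4]
    load_vec<5>(src_v[0], input);     // new: matches compute_vec<5>, which reads src_v[0][0..4]
}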
......@@ -37,6 +37,26 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> {
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {}
};
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
//! the x86 and rvv GiSimdFmaLane API is slow; as an alternative, use
//! GiMultiplyAddScalarFloat32
#define MLA GiMultiplyAddScalarFloat32
#define cb(step) \
c[0][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); \
c[1][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[1][step]), \
GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4)));
#define cb2(step) \
c[0][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4)));
#else
#define cb(step) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
......@@ -55,6 +75,8 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> {
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \
(step * stride + src_idx) % 4));
#endif
#undef MLA
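In short, the intent of the branch above: GiSimdFmaLane maps to a single lane-indexed FMA on NEON, but x86 and RVV have no cheap equivalent, so on those targets the source taps are kept as plain scalars in memory and broadcast through GiMultiplyAddScalarFloat32. A rough sketch of the two shapes, where fma_one_tap is an illustrative wrapper (not part of the patch) whose arguments mirror the usage in cb()/cb2():

// Illustrative only: one multiply-accumulate of accumulator acc with weight vector w
// and the source tap selected by the compile-time lane index.
template <int lane>
GI_FLOAT32_t fma_one_tap(
        GI_FLOAT32_t acc, GI_FLOAT32_t w, const float* src_scalars, GI_FLOAT32_t src_vec) {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
    // read one float and let the intrinsic broadcast it
    return GiMultiplyAddScalarFloat32(acc, w, src_scalars[lane]);
#else
    // lane-indexed FMA, a single vfmaq_laneq_f32 on NEON
    return GiSimdFmaLane(acc, w, src_vec, lane);
#endif
}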
#define SHIFT_CAL_HELPER(ow_remain) \
template < \
......@@ -151,23 +173,38 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block, stride, ow_
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
//! the x86 and rvv GiSimdFmaLane API is slow; as an alternative, use
//! GiMultiplyAddScalarFloat32
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[src_reg_size];
#else
GI_FLOAT32_FIXLEN_t src[src_reg_size];
#endif
GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];
#define KERNEL_CB(step) \
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0); \
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>( \
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<5, 5, c_dim, stride, remain_w>(c, src, weight); \
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
#define SRC_LOAD(step) \
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + step * iw, 0)
#else
#define SRC_LOAD(step) \
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0)
#endif
#define KERNEL_CB(step) \
SRC_LOAD(step); \
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>( \
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<5, 5, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<6, 6, c_dim, stride, remain_w>(c, src, weight);
UNROLL_CALL_RAW(7, KERNEL_CB)
#undef KERNEL_CB
#undef SRC_LOAD
src_ptr += ld_src_ic;
weight_ptr += ld_weight_ic;
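The net effect of the two SRC_LOAD variants: on x86/RVV, src is an array of const float* and SRC_LOAD only records addresses into the input row, with the scalar fetched later inside cb()/cb2() as *(src[idx / 4] + idx % 4); on other targets src holds vectors that are loaded up front. A sketch of what one row load leaves behind, assuming simd_len == 4 and writing the unrolled helpers as plain loops:

// Loop form is illustrative; the real load_ptr_helper / load_helper are macro-unrolled,
// and GiLoadFloat32 stands in for whatever Vld1qF32S wraps.
void demo_src_load(const float* src_ptr, int src_reg_size) {
    // x86 / RVV branch: record pointers only, no vector loads yet.
    const float* src_ptrs[16];
    for (int i = 0; i < src_reg_size; ++i)
        src_ptrs[i] = src_ptr + i * 4;
    float tap = *(src_ptrs[9 / 4] + 9 % 4);  // how the MLA macro fetches input element 9
    (void)tap;

    // other targets: the vectors are loaded before the cal_helper calls.
    GI_FLOAT32_FIXLEN_t src_vec[16];
    for (int i = 0; i < src_reg_size; ++i)
        src_vec[i] = GiFloat32Type2FixLenType(GiLoadFloat32(src_ptr + i * 4));
}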
......@@ -200,20 +237,33 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block, stride, ow_
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[src_reg_size];
#else
GI_FLOAT32_FIXLEN_t src[src_reg_size];
#endif
GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];
#define KERNEL_CB(step) \
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0); \
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>( \
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
#define SRC_LOAD(step) \
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + step * iw, 0)
#else
#define SRC_LOAD(step) \
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0)
#endif
#define KERNEL_CB(step) \
SRC_LOAD(step); \
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>( \
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight);
UNROLL_CALL_RAW(5, KERNEL_CB)
#undef KERNEL_CB
#undef SRC_LOAD
src_ptr += ld_src_ic;
weight_ptr += ld_weight_ic;
......@@ -246,10 +296,18 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride, ow_
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[src_reg_size];
#else
GI_FLOAT32_FIXLEN_t src[src_reg_size];
#endif
GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];
// row 0
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr, 0);
#else
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
......@@ -257,7 +315,11 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride, ow_
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);
// row 1
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + iw, 0);
#else
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + iw, 0);
#endif
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
......@@ -265,8 +327,12 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride, ow_
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);
// row 2
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + 2 * iw, 0);
#else
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(
src, src_ptr + 2 * iw, 0);
#endif
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr + 2 * ld_weight_fw, ld_weight_oc);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
......@@ -637,17 +703,29 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block, stride, ow_
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[src_reg_size];
#else
GI_FLOAT32_FIXLEN_t src[src_reg_size];
#endif
GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];
// row 0
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr, 0);
#else
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);
// row 1
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + iw, 0);
#else
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + iw, 0);
#endif
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
......@@ -670,7 +748,7 @@ struct ConvDirectFp32NchwNchw44 {
constexpr int fh = filter_size;
constexpr int fw = filter_size;
constexpr int ic_step = 1;
#if MEGDNN_ARMV7
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) || defined(MEGDNN_ARMV7)
constexpr int big_oc_step = 4;
#else
constexpr int big_oc_step = 8;
......
......@@ -62,6 +62,13 @@ struct LoadHelper {
static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args);
};
template <
int weight_number, int base_offset, int ptr_step, int oc_block, typename T,
typename T2>
struct LoadPtrHelper {
static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset);
};
#define WEIGHT_CB(step) \
src[step] = GiFloat32Type2FixLenType( \
Func::impl(ptr + base_offset + step * ptr_step, args...));
......@@ -96,6 +103,36 @@ LOAD_HELPER(16);
#undef LOAD_HELPER
#undef WEIGHT_CB
#define WEIGHT_PTR_CB(step) src[step] = ptr + base_offset + step * ptr_step;
#define LOAD_PTR_HELPER(step) \
template <int base_offset, int ptr_step, typename T, typename T2> \
struct LoadPtrHelper<step, base_offset, ptr_step, 0, T, T2> { \
static GI_FORCEINLINE void impl(T& src, T2 ptr, int) { \
UNROLL_CALL_RAW(step, WEIGHT_PTR_CB); \
} \
}
LOAD_PTR_HELPER(1);
LOAD_PTR_HELPER(2);
LOAD_PTR_HELPER(3);
LOAD_PTR_HELPER(4);
LOAD_PTR_HELPER(5);
LOAD_PTR_HELPER(6);
LOAD_PTR_HELPER(7);
LOAD_PTR_HELPER(8);
LOAD_PTR_HELPER(9);
LOAD_PTR_HELPER(10);
LOAD_PTR_HELPER(11);
LOAD_PTR_HELPER(12);
LOAD_PTR_HELPER(13);
LOAD_PTR_HELPER(14);
LOAD_PTR_HELPER(15);
LOAD_PTR_HELPER(16);
#undef LOAD_PTR_HELPER
#undef WEIGHT_PTR_CB
///////////////////////////c_dim = 1/////////////////////////
#define WEIGHT_CB(step) \
src[0][step] = \
......@@ -122,6 +159,29 @@ LOAD_HELPER(9);
#undef LOAD_HELPER
#undef WEIGHT_CB
#define WEIGHT_PTR_CB(step) src[0][step] = ptr + base_offset + step * ptr_step;
#define LOAD_PTR_HELPER(step) \
template <int base_offset, int ptr_step, typename T, typename T2> \
struct LoadPtrHelper<step, base_offset, ptr_step, 1, T, T2> { \
static GI_FORCEINLINE void impl(T& src, T2 ptr, int) { \
UNROLL_CALL_RAW(step, WEIGHT_PTR_CB); \
} \
}
LOAD_PTR_HELPER(1);
LOAD_PTR_HELPER(2);
LOAD_PTR_HELPER(3);
LOAD_PTR_HELPER(4);
LOAD_PTR_HELPER(5);
LOAD_PTR_HELPER(6);
LOAD_PTR_HELPER(7);
LOAD_PTR_HELPER(8);
LOAD_PTR_HELPER(9);
#undef LOAD_PTR_HELPER
#undef WEIGHT_PTR_CB
/////////////////////////c_dim = 2///////////////////////////////
#define WEIGHT_CB(step) \
src[0][step] = \
......@@ -149,6 +209,30 @@ LOAD_HELPER(8);
#undef LOAD_HELPER
#undef WEIGHT_CB
#define WEIGHT_PTR_CB(step) \
src[0][step] = ptr + base_offset + step * ptr_step; \
src[1][step] = ptr + base_offset + step * ptr_step + oc_offset;
#define LOAD_PTR_HELPER(step) \
template <int base_offset, int ptr_step, typename T, typename T2> \
struct LoadPtrHelper<step, base_offset, ptr_step, 2, T, T2> { \
static GI_FORCEINLINE void impl(T& src, T2 ptr, int oc_offset) { \
UNROLL_CALL_RAW(step, WEIGHT_PTR_CB); \
} \
}
LOAD_PTR_HELPER(1);
LOAD_PTR_HELPER(2);
LOAD_PTR_HELPER(3);
LOAD_PTR_HELPER(4);
LOAD_PTR_HELPER(5);
LOAD_PTR_HELPER(6);
LOAD_PTR_HELPER(7);
LOAD_PTR_HELPER(8);
#undef LOAD_PTR_HELPER
#undef WEIGHT_PTR_CB
template <
int weight_number, int base_offset, int ptr_step, int c_dim, typename Func,
typename T, typename T2>
......@@ -157,6 +241,14 @@ GI_FORCEINLINE void load_helper(T& weight, T2 ptr, int oc_offset) {
weight, ptr, oc_offset);
}
template <
int weight_number, int base_offset, int ptr_step, int c_dim, typename T,
typename T2>
GI_FORCEINLINE void load_ptr_helper(T& weight, T2 ptr, int oc_offset) {
LoadPtrHelper<weight_number, base_offset, ptr_step, c_dim, T, T2>::impl(
weight, ptr, oc_offset);
}
////////////////////Store_OCX_OW8_Remain/////////////////////////
template <int c_dim, int ow_remain, typename Op, typename T, typename T2, typename T3>
struct StoreOcxOw8Remain {
......
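load_ptr_helper mirrors load_helper, but instead of issuing Vld1qF32S loads it only writes ptr + base_offset + step * ptr_step pointers into src, with the same oc_block specializations (0: flat array; 1: src[0][step]; 2: src[0]/src[1] offset by oc_offset). For example, a c_dim = 0 call such as the one behind SRC_LOAD expands roughly as follows (the concrete template arguments are illustrative, assuming simd_len == 4):

// Rough expansion of load_ptr_helper<4, 0, 4, 0>(src, src_ptr, 0),
// i.e. weight_number = 4, base_offset = 0, ptr_step = 4, oc_block = 0.
void demo_load_ptr_expansion(const float* src_ptr) {
    const float* src[4];
    // UNROLL_CALL_RAW(4, WEIGHT_PTR_CB):
    src[0] = src_ptr + 0 + 0 * 4;
    src[1] = src_ptr + 0 + 1 * 4;
    src[2] = src_ptr + 0 + 2 * 4;
    src[3] = src_ptr + 0 + 3 * 4;
    (void)src;
}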
......@@ -1110,7 +1110,7 @@ GI_FORCEINLINE
GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmaxq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
#elif defined(GI_SSE2_INTRINSICS)
return _mm_max_ps(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
return vfmax_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
......@@ -1127,7 +1127,7 @@ GI_FORCEINLINE
GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vminq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
#elif defined(GI_SSE2_INTRINSICS)
return _mm_min_ps(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
return vfmin_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
......
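The last two hunks fix the platform guard: the SSE path was gated on GI_NEON32_INTRINSICS, which is never defined on x86, so GiMaximumFloat32/GiMinimumFloat32 fell through to the generic fallback there; gating on GI_SSE2_INTRINSICS lets x86 hit _mm_max_ps/_mm_min_ps directly. A minimal usage sketch, assuming GiLoadFloat32/GiStoreFloat32 from the same GI float header:

// Elementwise max/min of two small float buffers through the GI wrappers.
void demo_minmax(const float* a, const float* b, float* out_max, float* out_min) {
    GI_FLOAT32_t va = GiLoadFloat32(a);  // GI_SIMD_LEN_BYTE / sizeof(float) lanes
    GI_FLOAT32_t vb = GiLoadFloat32(b);
    GiStoreFloat32(out_max, GiMaximumFloat32(va, vb));  // now _mm_max_ps on x86
    GiStoreFloat32(out_min, GiMinimumFloat32(va, vb));  // now _mm_min_ps on x86
}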