diff --git a/CMakeLists.txt b/CMakeLists.txt index 7405ee25055c37a67762291dc6409c6a44214385..f773219646884b09bf1202ad83c0c7590364cb03 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,8 +128,11 @@ else() set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") if(ANDROID) set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -DNDEBUG -g") + else() set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -DNDEBUG -g") endif() endif() diff --git a/dnn/include/megdnn/arch.h b/dnn/include/megdnn/arch.h index bc912d64298fa934e0fefa816974822b0de324ac..0eef23510f8be17fef9344bb0147b266c400eb76 100644 --- a/dnn/include/megdnn/arch.h +++ b/dnn/include/megdnn/arch.h @@ -29,6 +29,13 @@ #define megdnn_likely(v) __builtin_expect(bool(v), 1) #define megdnn_unlikely(v) __builtin_expect(bool(v), 0) +#if !defined(__clang__) && MEGDNN_ARMV7 && !defined(NDEBUG) +//! Thumb2 limit code length +#define MEGDNN_ALWAYS_INLINE +#else +#define MEGDNN_ALWAYS_INLINE inline __attribute__((__always_inline__)) +#endif + #define MEGDNN_DEPRECATED __attribute__((deprecated)) #define MEGDNN_PACKED __attribute__((packed)) #define MEGDNN_CONSTEXPR constexpr diff --git a/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_kern.h b/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_kern.h index 205855f55560193fd8c7c5c8871fed96bea5bdce..70b7a1de3e7384f22107c67b7478e426b6320912 100644 --- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_kern.h +++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_kern.h @@ -10,6 +10,7 @@ * implied. */ #pragma once +#include "megdnn/arch.h" #include "src/arm_common/conv_bias/intrinsic_helper.h" #include "src/arm_common/conv_bias/opr_impl.h" #include "src/arm_common/elemwise_op.h" @@ -17,7 +18,6 @@ #include "src/common/unroll_macro.h" #include "src/common/utils.h" #include "src/fallback/conv_bias/common.h" - namespace megdnn { namespace arm_common { namespace { @@ -32,13 +32,13 @@ namespace { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight); + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight); }; template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { #define cb(step) \ c[0][step] = Func::template impl<(step * stride + src_idx) % 4>( \ c[0][step], weight[0][weight_idx], \ @@ -54,7 +54,7 @@ struct ShiftCalHelper { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { #define cb(step) \ c[0][step] = Func::template impl<(step * stride + src_idx) % 4>( \ c[0][step], weight[0][weight_idx], \ @@ -67,7 +67,7 @@ struct ShiftCalHelper { template -inline void cal_helper(T& c, T2& src, T3& weight) { +MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) { ShiftCalHelper::impl( c, src, weight); }; diff --git a/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.cpp b/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.cpp index 9daf67a1ce6a1c1838da010ba777bf1c9fa2e818..ba8af1efc0fd0341a42e82d0f07f712958644da3 100644 --- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.cpp @@ -11,6 +11,7 @@ * implied. */ +#include "megdnn/arch.h" #include "src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.h" #include "src/arm_common/conv_bias/intrinsic_helper.h" #include "src/arm_common/elemwise_op.h" @@ -26,13 +27,13 @@ namespace { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight); + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight); }; template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { #define cb(step, lane) \ c[0][step] = Func::template impl(c[0][step], weight[0][lane], \ src[(step + src_idx) % 8]); \ @@ -49,7 +50,7 @@ struct ShiftCalHelper { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { #define cb(step, lane) \ c[0][step] = Func::template impl(c[0][step], weight[0][lane], \ src[(step + src_idx) % 4]); \ @@ -66,7 +67,7 @@ struct ShiftCalHelper { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { #define cb(step, lane) \ c[0][step] = Func::template impl(c[0][step], weight[0][lane], \ src[(step + src_idx) % 8]); @@ -81,7 +82,7 @@ struct ShiftCalHelper { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { #define cb(step, lane) \ c[0][step] = Func::template impl(c[0][step], weight[0][lane], \ src[(step + src_idx) % 4]); @@ -96,7 +97,7 @@ struct ShiftCalHelper { template -inline void cal_helper(T& c, T2& src, T3& weight) { +MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) { ShiftCalHelper::impl(c, src, weight); }; diff --git a/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.cpp b/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.cpp index 2b2b73d8c11ec737122ebc47929f30b82143ed76..78019dd6b8c391499cbbc871137c436ae64e0be8 100644 --- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.cpp @@ -11,6 +11,7 @@ * implied. */ +#include "megdnn/arch.h" #include "src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h" #include "src/arm_common/conv_bias/intrinsic_helper.h" #include "src/arm_common/elemwise_op.h" @@ -26,13 +27,13 @@ namespace { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight); + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight); }; template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { #define cb(step, lane) \ c[0][step] = Func::template impl(c[0][step], weight[0][lane], \ src[(step + src_idx) % 8]); \ @@ -49,7 +50,7 @@ struct ShiftCalHelper { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { #define cb(step, lane) \ c[0][step] = Func::template impl(c[0][step], weight[0][lane], \ src[(step + src_idx) % 4]); \ @@ -66,7 +67,7 @@ struct ShiftCalHelper { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { #define cb(step, lane) \ c[0][step] = Func::template impl(c[0][step], weight[0][lane], \ src[(step + src_idx) % 8]); @@ -81,7 +82,7 @@ struct ShiftCalHelper { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { #define cb(step, lane) \ c[0][step] = Func::template impl(c[0][step], weight[0][lane], \ src[(step + src_idx) % 4]); @@ -96,7 +97,7 @@ struct ShiftCalHelper { template -inline void cal_helper(T& c, T2& src, T3& weight) { +MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) { ShiftCalHelper::impl(c, src, weight); }; @@ -462,9 +463,10 @@ inline void odd_even_split_iw8_even(float* sptr_base, const float* sptr, vst1q_f32(sptr_base + odd_offset + 2 * ic_step, temp[5]); vst1q_f32(sptr_base + odd_offset + 3 * ic_step, temp[7]); } -void odd_even_split_iw8_odd(float* sptr_base, const float* sptr, - const int odd_start, const int src_idx, - const int iw_idx) { + +inline void odd_even_split_iw8_odd(float* sptr_base, const float* sptr, + const int odd_start, const int src_idx, + const int iw_idx) { constexpr int ic_step = 4; const int src_offset = src_idx * ic_step; const int even_offset = (iw_idx + 1) / 2 * ic_step; diff --git a/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_kern.h b/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_kern.h index 4b57156f950b7fec5fedb00008686e93c0756072..3d9e9a0865ee55062688738555e8538025ddec52 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_kern.h +++ b/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_kern.h @@ -5,11 +5,13 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ - +#pragma once #ifdef __ARM_FEATURE_DOTPROD +#include "megdnn/arch.h" #include "src/arm_common/conv_bias/intrinsic_helper.h" #include "src/arm_common/elemwise_op.h" #include "src/arm_common/intrinsic_helper.h" @@ -27,8 +29,8 @@ constexpr int filter_next_col = IC_PACK_SIZE * OC_PACK_SIZE; //! [OC/4, IC/4, FH, FW, 4OC, 4IC] template -inline void init_ocx_ow8(int32x4_t c[][8], const int32_t* bias_ptr, - int oc_step) { +MEGDNN_ALWAYS_INLINE void init_ocx_ow8(int32x4_t c[][8], + const int32_t* bias_ptr, int oc_step) { static_assert(row == 1 || row == 2 || row == 3, "Invalid OC number."); if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) { #define BIAS_INIT(step, i) c[i][step] = vld1q_s32(bias_ptr + i * oc_step); @@ -90,12 +92,13 @@ inline void init_ocx_ow8(int32x4_t c[][8], const int32_t* bias_ptr, template struct StoreOCxOWx { - static void impl(int32x4_t res[][8], const Op& op, T* dst_ptr, - const int ld_dst_oc); + static MEGDNN_ALWAYS_INLINE void impl(int32x4_t res[][8], const Op& op, + T* dst_ptr, const int ld_dst_oc); }; template struct StoreOCxOWx<1, ow_remain, Op, T> { + static void impl(int32x4_t res[][8], const Op& op, T* dst_ptr, const int ld_dst_oc) { MEGDNN_MARK_USED_VAR(ld_dst_oc); @@ -128,8 +131,8 @@ struct StoreOCxOWx<1, ow_remain, Op, T> { template struct StoreOCxOWx<2, ow_remain, Op, T> { - static void impl(int32x4_t res[][8], const Op& op, T* dst_ptr, - const int ld_dst_oc) { + static MEGDNN_ALWAYS_INLINE void impl(int32x4_t res[][8], const Op& op, + T* dst_ptr, const int ld_dst_oc) { switch (ow_remain) { case 8: UNROLL_CALL_RAW(4, cb22); @@ -159,8 +162,8 @@ struct StoreOCxOWx<2, ow_remain, Op, T> { template struct StoreOCxOWx<3, ow_remain, Op, T> { - static void impl(int32x4_t res[][8], const Op& op, T* dst_ptr, - const int ld_dst_oc) { + static MEGDNN_ALWAYS_INLINE void impl(int32x4_t res[][8], const Op& op, + T* dst_ptr, const int ld_dst_oc) { switch (ow_remain) { case 8: UNROLL_CALL_RAW(4, cb32); @@ -196,15 +199,16 @@ struct StoreOCxOWx<3, ow_remain, Op, T> { #undef cb32 template -inline void store_ocx_owx_remain_static(int32x4_t res[][8], const Op& op, - T* dst_ptr, const int ld_dst_oc) { +MEGDNN_ALWAYS_INLINE void store_ocx_owx_remain_static(int32x4_t res[][8], + const Op& op, T* dst_ptr, + const int ld_dst_oc) { StoreOCxOWx::impl(res, op, dst_ptr, ld_dst_oc); } template struct ShiftCalHelper { - static void impl(T& res, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& res, T2& src, T3& weight) { #define cb(step) \ res[res_row][step] = FUNC::template impl<((src_start_idx + step) % 4)>( \ res[res_row][step], weight[weight_idx], \ @@ -216,7 +220,7 @@ struct ShiftCalHelper { template -inline void cal_helper(T& res, T2& src, T3& weight) { +MEGDNN_ALWAYS_INLINE void cal_helper(T& res, T2& src, T3& weight) { ShiftCalHelper::impl(res, src, weight); }; @@ -428,4 +432,4 @@ struct KernNeonSdotNCHW44 struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight, T4& temp); - static void impl(T& c, T2& src, T3& weight); + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp); + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight); }; template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight, T4& temp) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { c[0][0] = Func::impl(src[0 + src_idx], weight[0][weight_idx], c[0][0], temp[0]); c[1][0] = Func::impl(src[0 + src_idx], weight[1][weight_idx], c[1][0], @@ -61,7 +62,7 @@ struct ShiftCalHelper { c[1][3] = Func::impl(src[3 + src_idx], weight[1][weight_idx], c[1][3], temp[3]); } - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { c[0][0] = Func::impl(src[0 + src_idx], weight[0][weight_idx], c[0][0]); c[1][0] = Func::impl(src[0 + src_idx], weight[1][weight_idx], c[1][0]); c[0][1] = Func::impl(src[1 + src_idx], weight[0][weight_idx], c[0][1]); @@ -75,7 +76,7 @@ struct ShiftCalHelper { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight, T4& temp) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { c[0][0] = Func::impl(src[0 + src_idx], weight[0][weight_idx], c[0][0], temp[0]); c[0][1] = Func::impl(src[1 + src_idx], weight[0][weight_idx], c[0][1], @@ -85,7 +86,7 @@ struct ShiftCalHelper { c[0][3] = Func::impl(src[3 + src_idx], weight[0][weight_idx], c[0][3], temp[2]); } - static void impl(T& c, T2& src, T3& weight) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { c[0][0] = Func::impl(src[0 + src_idx], weight[0][weight_idx], c[0][0]); c[0][1] = Func::impl(src[1 + src_idx], weight[0][weight_idx], c[0][1]); c[0][2] = Func::impl(src[2 + src_idx], weight[0][weight_idx], c[0][2]); @@ -96,7 +97,7 @@ struct ShiftCalHelper { template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight, T4& temp) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { c[0][0] = Func::impl(src[(0 + src_idx) % 8], weight[0][weight_idx], c[0][0], temp[0]); c[1][0] = Func::impl(src[(0 + src_idx) % 8], weight[1][weight_idx], @@ -131,12 +132,12 @@ struct ShiftCalHelper { c[1][7] = Func::impl(src[(7 + src_idx) % 8], weight[1][weight_idx], c[1][7], temp[3]); } - static void impl(T&, T2&, T3&); + static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&); }; template struct ShiftCalHelper { - static void impl(T& c, T2& src, T3& weight, T4& temp) { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { c[0][0] = Func::impl(src[(0 + src_idx) % 8], weight[0][weight_idx], c[0][0], temp[0]); c[0][1] = Func::impl(src[(1 + src_idx) % 8], weight[0][weight_idx], @@ -154,18 +155,18 @@ struct ShiftCalHelper { c[0][7] = Func::impl(src[(7 + src_idx) % 8], weight[0][weight_idx], c[0][7], temp[3]); } - static void impl(T&, T2&, T3&); + static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&); }; template -inline void cal_helper(T& c, T2& src, T3& weight, T4& temp) { +MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight, T4& temp) { ShiftCalHelper::impl(c, src, weight, temp); } template -inline void cal_helper(T& c, T2& src, T3& weight) { +MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) { ShiftCalHelper::impl(c, src, weight); }; @@ -703,8 +704,9 @@ struct KerNeonXXs2NchwNchw44 { enum PACK_MODE { NO_PAD = 0, FIRST_PAD = 1, LAST_PAD = 2 }; template -inline void pack_src_one_line(const int8_t* inptr, int8_t* outptr, int left_pad, - int right_pad, const int iw) { +MEGDNN_ALWAYS_INLINE void pack_src_one_line(const int8_t* inptr, int8_t* outptr, + int left_pad, int right_pad, + const int iw) { const int8_t* src_row_0 = inptr; const int8_t* src_row_1 = inptr + iw; constexpr int combine_row = 2; @@ -1235,6 +1237,7 @@ struct ConvDiectStrideInt8NchwNchw44 { } } } + if (oc_remain > 0) { size_t oc_idx = oc_end; const size_t weight_offset = oc_idx * ic * fh * fw; @@ -1284,4 +1287,5 @@ static void conv_direct_int8_nchw_nchw44(const int8_t* src, } // namespace } // namespace arm_common } // namespace megdnn - // vim: syntax=cpp.doxygen \ No newline at end of file + +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/arm_common/conv_bias/intrinsic_helper.h b/dnn/src/arm_common/conv_bias/intrinsic_helper.h index 67e2d3f7747094763635a02037dc1e77969cd346..e27f8e2946b3b08de8201d11a76bff9398bec828 100644 --- a/dnn/src/arm_common/conv_bias/intrinsic_helper.h +++ b/dnn/src/arm_common/conv_bias/intrinsic_helper.h @@ -15,18 +15,20 @@ #include "src/arm_common/simd_macro/marm_neon.h" #include "src/common/unroll_macro.h" #include "src/fallback/conv_bias/common.h" + +#define __ai inline __attribute__((__always_inline__)) namespace megdnn { namespace { ////////////////////Store_OC4_OW8_Remain///////////////////////// template struct Store_OC4_OW8_Remain { - static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr); + static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr); }; template struct Store_OC4_OW8_Remain<0, Op> { - static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { + static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { op({{c[0], c[1]}}, reinterpret_cast(dst_ptr)); op({{c[2], c[3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[4], c[5]}}, reinterpret_cast(dst_ptr + 16)); @@ -36,7 +38,7 @@ struct Store_OC4_OW8_Remain<0, Op> { template struct Store_OC4_OW8_Remain<7, Op> { - static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { + static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { op({{c[0], c[1]}}, reinterpret_cast(dst_ptr)); op({{c[2], c[3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[4], c[5]}}, reinterpret_cast(dst_ptr + 16)); @@ -45,7 +47,7 @@ struct Store_OC4_OW8_Remain<7, Op> { }; template struct Store_OC4_OW8_Remain<6, Op> { - static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { + static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { op({{c[0], c[1]}}, reinterpret_cast(dst_ptr)); op({{c[2], c[3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[4], c[5]}}, reinterpret_cast(dst_ptr + 16)); @@ -53,7 +55,7 @@ struct Store_OC4_OW8_Remain<6, Op> { }; template struct Store_OC4_OW8_Remain<5, Op> { - static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { + static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { op({{c[0], c[1]}}, reinterpret_cast(dst_ptr)); op({{c[2], c[3]}}, reinterpret_cast(dst_ptr + 8)); op(c[4], reinterpret_cast(dst_ptr + 16)); @@ -61,46 +63,46 @@ struct Store_OC4_OW8_Remain<5, Op> { }; template struct Store_OC4_OW8_Remain<4, Op> { - static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { + static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { op({{c[0], c[1]}}, reinterpret_cast(dst_ptr)); op({{c[2], c[3]}}, reinterpret_cast(dst_ptr + 8)); } }; template struct Store_OC4_OW8_Remain<3, Op> { - static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { + static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { op({{c[0], c[1]}}, reinterpret_cast(dst_ptr)); op(c[2], reinterpret_cast(dst_ptr + 8)); } }; template struct Store_OC4_OW8_Remain<2, Op> { - static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { + static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { op({{c[0], c[1]}}, reinterpret_cast(dst_ptr)); } }; template struct Store_OC4_OW8_Remain<1, Op> { - static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { + static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) { op(c[0], reinterpret_cast(dst_ptr)); } }; template -inline void store_oc4_ow8_remain_static(int32x4_t c[8], const Op& op, - int8_t* dst_ptr) { +__ai void store_oc4_ow8_remain_static(int32x4_t c[8], const Op& op, + int8_t* dst_ptr) { Store_OC4_OW8_Remain::impl(c, op, dst_ptr); } template struct StoreOcxOw4Remain { - static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc); + static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc); }; template struct StoreOcxOw4Remain<2, 0, Op, T> { - static void impl(int32x4_t c[2][4], const Op& op, int8_t* dst_ptr, - int ld_dst_oc) { + static __ai void impl(int32x4_t c[2][4], const Op& op, int8_t* dst_ptr, + int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); @@ -113,7 +115,7 @@ struct StoreOcxOw4Remain<2, 0, Op, T> { template struct StoreOcxOw4Remain<2, 3, Op, T> { - static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op(c[0][2], reinterpret_cast(dst_ptr + 8)); @@ -124,7 +126,7 @@ struct StoreOcxOw4Remain<2, 3, Op, T> { }; template struct StoreOcxOw4Remain<2, 2, Op, T> { - static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[1][0], c[1][1]}}, reinterpret_cast(dst_ptr + ld_dst_oc)); @@ -132,7 +134,7 @@ struct StoreOcxOw4Remain<2, 2, Op, T> { }; template struct StoreOcxOw4Remain<2, 1, Op, T> { - static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { op(c[0][0], reinterpret_cast(dst_ptr)); op(c[1][0], reinterpret_cast(dst_ptr + ld_dst_oc)); } @@ -140,8 +142,8 @@ struct StoreOcxOw4Remain<2, 1, Op, T> { template struct StoreOcxOw4Remain<1, 0, Op, T> { - static void impl(int32x4_t c[2][4], const Op& op, int8_t* dst_ptr, - int ld_dst_oc) { + static __ai void impl(int32x4_t c[2][4], const Op& op, int8_t* dst_ptr, + int ld_dst_oc) { MEGDNN_MARK_USED_VAR(ld_dst_oc); op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); @@ -150,7 +152,7 @@ struct StoreOcxOw4Remain<1, 0, Op, T> { template struct StoreOcxOw4Remain<1, 3, Op, T> { - static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { MEGDNN_MARK_USED_VAR(ld_dst_oc); op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op(c[0][2], reinterpret_cast(dst_ptr + 8)); @@ -158,33 +160,33 @@ struct StoreOcxOw4Remain<1, 3, Op, T> { }; template struct StoreOcxOw4Remain<1, 2, Op, T> { - static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { MEGDNN_MARK_USED_VAR(ld_dst_oc); op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); } }; template struct StoreOcxOw4Remain<1, 1, Op, T> { - static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) { MEGDNN_MARK_USED_VAR(ld_dst_oc); op(c[0][0], reinterpret_cast(dst_ptr)); } }; template -inline void store_ocx_ow4_remain_static(T& c, const Op& op, int8_t* dst_ptr, - int ld_dst_oc) { +__ai void store_ocx_ow4_remain_static(T& c, const Op& op, int8_t* dst_ptr, + int ld_dst_oc) { StoreOcxOw4Remain::impl(c, op, dst_ptr, ld_dst_oc); } ////////////////////Store_OCX_OW8_Remain///////////////////////// template struct StoreOcxOw8Remain { - static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc); + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc); }; template struct StoreOcxOw8Remain<2, 0, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); @@ -200,7 +202,7 @@ struct StoreOcxOw8Remain<2, 0, Op, T, T2, T3> { }; template struct StoreOcxOw8Remain<2, 8, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); @@ -216,7 +218,7 @@ struct StoreOcxOw8Remain<2, 8, Op, T, T2, T3> { }; template struct StoreOcxOw8Remain<2, 7, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); @@ -231,7 +233,7 @@ struct StoreOcxOw8Remain<2, 7, Op, T, T2, T3> { }; template struct StoreOcxOw8Remain<2, 6, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); @@ -244,7 +246,7 @@ struct StoreOcxOw8Remain<2, 6, Op, T, T2, T3> { }; template struct StoreOcxOw8Remain<2, 5, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op(c[0][4], reinterpret_cast(dst_ptr + 16)); @@ -256,7 +258,7 @@ struct StoreOcxOw8Remain<2, 5, Op, T, T2, T3> { }; template struct StoreOcxOw8Remain<2, 4, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); @@ -266,7 +268,7 @@ struct StoreOcxOw8Remain<2, 4, Op, T, T2, T3> { }; template struct StoreOcxOw8Remain<2, 3, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op(c[0][2], reinterpret_cast(dst_ptr + 8)); @@ -276,14 +278,14 @@ struct StoreOcxOw8Remain<2, 3, Op, T, T2, T3> { }; template struct StoreOcxOw8Remain<2, 2, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[1][0], c[1][1]}}, reinterpret_cast(dst_ptr + ld_dst_oc)); } }; template struct StoreOcxOw8Remain<2, 1, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { op(c[0][0], reinterpret_cast(dst_ptr)); op(c[1][0], reinterpret_cast(dst_ptr + ld_dst_oc)); } @@ -291,7 +293,7 @@ struct StoreOcxOw8Remain<2, 1, Op, T, T2, T3> { template struct StoreOcxOw8Remain<1, 0, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); @@ -300,7 +302,7 @@ struct StoreOcxOw8Remain<1, 0, Op, T, T2, T3> { }; template struct StoreOcxOw8Remain<1, 8, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); @@ -310,7 +312,7 @@ struct StoreOcxOw8Remain<1, 8, Op, T, T2, T3> { template struct StoreOcxOw8Remain<1, 7, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); @@ -319,7 +321,7 @@ struct StoreOcxOw8Remain<1, 7, Op, T, T2, T3> { }; template struct StoreOcxOw8Remain<1, 6, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); @@ -327,7 +329,7 @@ struct StoreOcxOw8Remain<1, 6, Op, T, T2, T3> { }; template struct StoreOcxOw8Remain<1, 5, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op(c[0][4], reinterpret_cast(dst_ptr + 16)); @@ -335,41 +337,41 @@ struct StoreOcxOw8Remain<1, 5, Op, T, T2, T3> { }; template struct StoreOcxOw8Remain<1, 4, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); } }; template struct StoreOcxOw8Remain<1, 3, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op(c[0][2], reinterpret_cast(dst_ptr + 8)); } }; template struct StoreOcxOw8Remain<1, 2, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); } }; template struct StoreOcxOw8Remain<1, 1, Op, T, T2, T3> { - static void impl(T& c, const Op& op, T2 dst_ptr, int) { + static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) { op(c[0][0], reinterpret_cast(dst_ptr)); } }; template -inline void store_ocx_ow8_remain_static(T& c, const Op& op, T2 dst_ptr, - int ld_dst_oc) { +__ai void store_ocx_ow8_remain_static(T& c, const Op& op, T2 dst_ptr, + int ld_dst_oc) { StoreOcxOw8Remain::impl(c, op, dst_ptr, ld_dst_oc); } template -inline void store_ocx_ow8_remain_static_dt(T& c, const Op& op, T2 dst_ptr, - int ld_dst_oc) { +__ai void store_ocx_ow8_remain_static_dt(T& c, const Op& op, T2 dst_ptr, + int ld_dst_oc) { StoreOcxOw8Remain::impl(c, op, dst_ptr, ld_dst_oc); } @@ -377,14 +379,14 @@ inline void store_ocx_ow8_remain_static_dt(T& c, const Op& op, T2 dst_ptr, template struct Store_OC8_OW8_Remain { - static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, - int ld_dst_oc); + static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, + int ld_dst_oc); }; template struct Store_OC8_OW8_Remain<0, Op> { - static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, - int ld_dst_oc) { + static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, + int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); @@ -403,8 +405,8 @@ struct Store_OC8_OW8_Remain<0, Op> { template struct Store_OC8_OW8_Remain<7, Op> { - static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, - int ld_dst_oc) { + static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, + int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); @@ -422,8 +424,8 @@ struct Store_OC8_OW8_Remain<7, Op> { template struct Store_OC8_OW8_Remain<6, Op> { - static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, - int ld_dst_oc) { + static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, + int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); @@ -439,8 +441,8 @@ struct Store_OC8_OW8_Remain<6, Op> { template struct Store_OC8_OW8_Remain<5, Op> { - static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, - int ld_dst_oc) { + static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, + int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); op(c[0][4], reinterpret_cast(dst_ptr + 16)); @@ -455,8 +457,8 @@ struct Store_OC8_OW8_Remain<5, Op> { template struct Store_OC8_OW8_Remain<4, Op> { - static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, - int ld_dst_oc) { + static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, + int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); @@ -469,8 +471,8 @@ struct Store_OC8_OW8_Remain<4, Op> { template struct Store_OC8_OW8_Remain<3, Op> { - static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, - int ld_dst_oc) { + static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, + int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op(c[0][2], reinterpret_cast(dst_ptr + 8)); @@ -481,8 +483,8 @@ struct Store_OC8_OW8_Remain<3, Op> { }; template struct Store_OC8_OW8_Remain<2, Op> { - static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, - int ld_dst_oc) { + static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, + int ld_dst_oc) { op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); op({{c[1][0], c[1][1]}}, reinterpret_cast(dst_ptr + ld_dst_oc)); @@ -490,8 +492,8 @@ struct Store_OC8_OW8_Remain<2, Op> { }; template struct Store_OC8_OW8_Remain<1, Op> { - static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, - int ld_dst_oc) { + static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr, + int ld_dst_oc) { op(c[0][0], reinterpret_cast(dst_ptr)); op(c[1][0], reinterpret_cast(dst_ptr + ld_dst_oc)); } @@ -500,14 +502,14 @@ struct Store_OC8_OW8_Remain<1, Op> { /////////// template -inline void store_oc8_ow8_remain_static(T& c, const Op& op, T2 dst_ptr, - int ld_dst_oc) { +__ai void store_oc8_ow8_remain_static(T& c, const Op& op, T2 dst_ptr, + int ld_dst_oc) { Store_OC8_OW8_Remain::impl(c, op, dst_ptr, ld_dst_oc); } ////////////////////////////////////// template -inline void init_oc4_ow8(int32x4_t c[8], const int32_t* bias_ptr) { +__ai void init_oc4_ow8(int32x4_t c[8], const int32_t* bias_ptr) { if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) { #define BAIS_INIT(step) c[step] = vld1q_s32(bias_ptr); UNROLL_CALL_RAW(8, BAIS_INIT); @@ -520,8 +522,8 @@ inline void init_oc4_ow8(int32x4_t c[8], const int32_t* bias_ptr) { } template -inline void init_oc8_ow8(int32x4_t c[2][8], const int32_t* bias_ptr, - int oc_step) { +__ai void init_oc8_ow8(int32x4_t c[2][8], const int32_t* bias_ptr, + int oc_step) { if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) { #define BAIS_INIT(step) \ c[0][step] = vld1q_s32(bias_ptr); \ @@ -539,28 +541,28 @@ inline void init_oc8_ow8(int32x4_t c[2][8], const int32_t* bias_ptr, /////////////////////////init_ocx_ow8//////////////////// -inline float32x4_t neon_vdupq_n(float val) { +__ai float32x4_t neon_vdupq_n(float val) { return vdupq_n_f32(val); } -inline int32x4_t neon_vdupq_n(int val) { +__ai int32x4_t neon_vdupq_n(int val) { return vdupq_n_s32(val); } -inline float32x4_t neon_vld1q(const float* ptr) { +__ai float32x4_t neon_vld1q(const float* ptr) { return vld1q_f32(ptr); } -inline int32x4_t neon_vld1q(const int* ptr) { +__ai int32x4_t neon_vld1q(const int* ptr) { return vld1q_s32(ptr); } template struct InitOcxOw8 { - static void impl(T& c, const T2* bias_ptr, int oc_step); + static __ai void impl(T& c, const T2* bias_ptr, int oc_step); }; template struct InitOcxOw8<2, BiasMode::NO_BIAS, 8, T, T2> { - static void impl(T& c, const T2*, int) { + static __ai void impl(T& c, const T2*, int) { #define BAIS_INIT(step) \ c[0][step] = neon_vdupq_n(static_cast(0)); \ c[1][step] = neon_vdupq_n(static_cast(0)); @@ -570,7 +572,7 @@ struct InitOcxOw8<2, BiasMode::NO_BIAS, 8, T, T2> { }; template struct InitOcxOw8<2, BiasMode::NO_BIAS, 4, T, T2> { - static void impl(T& c, const T2*, int) { + static __ai void impl(T& c, const T2*, int) { #define BAIS_INIT(step) \ c[0][step] = neon_vdupq_n(static_cast(0)); \ c[1][step] = neon_vdupq_n(static_cast(0)); @@ -580,7 +582,7 @@ struct InitOcxOw8<2, BiasMode::NO_BIAS, 4, T, T2> { }; template struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> { - static void impl(T& c, const T2* bias_ptr, int oc_step) { + static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { #define BAIS_INIT(step) \ c[0][step] = neon_vld1q(bias_ptr); \ c[1][step] = neon_vld1q(bias_ptr + oc_step); @@ -590,7 +592,7 @@ struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> { }; template struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> { - static void impl(T& c, const T2* bias_ptr, int oc_step) { + static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { #define BAIS_INIT(step) \ c[0][step] = neon_vld1q(bias_ptr); \ c[1][step] = neon_vld1q(bias_ptr + oc_step); @@ -600,7 +602,7 @@ struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> { }; template struct InitOcxOw8<2, BiasMode::BIAS, 8, T, T2> { - static void impl(T& c, const T2* bias_ptr, int oc_step) { + static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { constexpr int simd_len = 4; #define BAIS_INIT(step) \ c[0][step] = neon_vld1q(bias_ptr + step * simd_len); \ @@ -611,7 +613,7 @@ struct InitOcxOw8<2, BiasMode::BIAS, 8, T, T2> { }; template struct InitOcxOw8<2, BiasMode::BIAS, 4, T, T2> { - static void impl(T& c, const T2* bias_ptr, int oc_step) { + static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { constexpr int simd_len = 4; #define BAIS_INIT(step) \ c[0][step] = neon_vld1q(bias_ptr + step * simd_len); \ @@ -623,7 +625,7 @@ struct InitOcxOw8<2, BiasMode::BIAS, 4, T, T2> { template struct InitOcxOw8<1, BiasMode::NO_BIAS, 8, T, T2> { - static void impl(T& c, const T2*, int) { + static __ai void impl(T& c, const T2*, int) { #define BAIS_INIT(step) c[0][step] = neon_vdupq_n(static_cast(0)); UNROLL_CALL_RAW(8, BAIS_INIT); #undef BAIS_INIT @@ -631,7 +633,7 @@ struct InitOcxOw8<1, BiasMode::NO_BIAS, 8, T, T2> { }; template struct InitOcxOw8<1, BiasMode::NO_BIAS, 4, T, T2> { - static void impl(T& c, const T2*, int) { + static __ai void impl(T& c, const T2*, int) { #define BAIS_INIT(step) c[0][step] = neon_vdupq_n(static_cast(0)); UNROLL_CALL_RAW(4, BAIS_INIT); #undef BAIS_INIT @@ -639,7 +641,7 @@ struct InitOcxOw8<1, BiasMode::NO_BIAS, 4, T, T2> { }; template struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> { - static void impl(T& c, const T2* bias_ptr, int) { + static __ai void impl(T& c, const T2* bias_ptr, int) { #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr); UNROLL_CALL_RAW(8, BAIS_INIT); #undef BAIS_INIT @@ -647,7 +649,7 @@ struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> { }; template struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> { - static void impl(T& c, const T2* bias_ptr, int) { + static __ai void impl(T& c, const T2* bias_ptr, int) { #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr); UNROLL_CALL_RAW(4, BAIS_INIT); #undef BAIS_INIT @@ -655,7 +657,7 @@ struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> { }; template struct InitOcxOw8<1, BiasMode::BIAS, 8, T, T2> { - static void impl(T& c, const T2* bias_ptr, int) { + static __ai void impl(T& c, const T2* bias_ptr, int) { constexpr int simd_len = 4; #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr + step * simd_len); UNROLL_CALL_RAW(8, BAIS_INIT); @@ -664,7 +666,7 @@ struct InitOcxOw8<1, BiasMode::BIAS, 8, T, T2> { }; template struct InitOcxOw8<1, BiasMode::BIAS, 4, T, T2> { - static void impl(T& c, const T2* bias_ptr, int) { + static __ai void impl(T& c, const T2* bias_ptr, int) { constexpr int simd_len = 4; #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr + step * simd_len); UNROLL_CALL_RAW(4, BAIS_INIT); @@ -673,18 +675,18 @@ struct InitOcxOw8<1, BiasMode::BIAS, 4, T, T2> { }; template -inline void init_ocx_ow8(T& c, const T2* bias_ptr, int oc_step) { +__ai void init_ocx_ow8(T& c, const T2* bias_ptr, int oc_step) { InitOcxOw8::impl(c, bias_ptr, oc_step); } /////////////////////init_ocx_ow4///////////////////// template struct InitOcxOw4 { - static void impl(T& c, const int32_t* bias_ptr, int oc_step); + static __ai void impl(T& c, const int32_t* bias_ptr, int oc_step); }; template struct InitOcxOw4<2, bias_mode, T> { - static void impl(T& c, const int32_t* bias_ptr, int oc_step) { + static __ai void impl(T& c, const int32_t* bias_ptr, int oc_step) { if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) { #define BAIS_INIT(step) \ c[0][step] = vld1q_s32(bias_ptr); \ @@ -703,7 +705,7 @@ struct InitOcxOw4<2, bias_mode, T> { template struct InitOcxOw4<1, bias_mode, T> { - static void impl(T& c, const int32_t* bias_ptr, int oc_step) { + static __ai void impl(T& c, const int32_t* bias_ptr, int oc_step) { MEGDNN_MARK_USED_VAR(oc_step); if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) { #define BAIS_INIT(step) c[0][step] = vld1q_s32(bias_ptr); @@ -718,12 +720,12 @@ struct InitOcxOw4<1, bias_mode, T> { }; template -inline void init_ocx_ow4(T& c, const int32_t* bias_ptr, int oc_step) { +__ai void init_ocx_ow4(T& c, const int32_t* bias_ptr, int oc_step) { InitOcxOw4::impl(c, bias_ptr, oc_step); } /////////////////////////////////////// } // namespace } // namespace megdnn - +#undef __ai // vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/intrinsic_helper.h b/dnn/src/arm_common/intrinsic_helper.h index 145430c22cad8a25c84eef057cb77b201cf4c73f..65ed5aebb358d890fb71438f270a2bf90ad0c69b 100644 --- a/dnn/src/arm_common/intrinsic_helper.h +++ b/dnn/src/arm_common/intrinsic_helper.h @@ -13,13 +13,14 @@ #include "src/arm_common/neon_struct.h" #include "src/arm_common/simd_macro/marm_neon.h" #include "src/common/unroll_macro.h" +#define __ai inline __attribute__((__always_inline__)) namespace megdnn { namespace { template struct LoadHelper { - static void impl(T& weight, T2 ptr, int oc_offset, XT... args); + static __ai void impl(T& weight, T2 ptr, int oc_offset, XT... args); }; #define WEIGHT_CB(step) \ @@ -29,7 +30,7 @@ struct LoadHelper { template \ struct LoadHelper { \ - static void impl(T& src, T2 ptr, int, XT... args) { \ + static __ai void impl(T& src, T2 ptr, int, XT... args) { \ UNROLL_CALL_RAW(step, WEIGHT_CB); \ } \ } @@ -62,7 +63,7 @@ LOAD_HELPER(16); template \ struct LoadHelper { \ - static void impl(T& src, T2 ptr, int) { \ + static __ai void impl(T& src, T2 ptr, int) { \ UNROLL_CALL_RAW(step, WEIGHT_CB); \ } \ } @@ -89,7 +90,7 @@ LOAD_HELPER(9); template \ struct LoadHelper { \ - static void impl(T& src, T2 ptr, int oc_offset) { \ + static __ai void impl(T& src, T2 ptr, int oc_offset) { \ UNROLL_CALL_RAW(step, WEIGHT_CB); \ } \ } @@ -108,19 +109,19 @@ LOAD_HELPER(8); template -inline void load_helper(T& weight, T2 ptr, int oc_offset) { +__ai void load_helper(T& weight, T2 ptr, int oc_offset) { LoadHelper::impl( weight, ptr, oc_offset); } template -inline void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) { +__ai void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) { LoadHelper::impl(weight, ptr, oc_offset, args...); } } // namespace } // namespace megdnn - +#undef __ai // vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/arm_common/neon_struct.h b/dnn/src/arm_common/neon_struct.h index 6aaf140999d30bcb46dca003548b8b460e9a78cf..7edc81347ea11d31b731059baeb1a9156439e82d 100644 --- a/dnn/src/arm_common/neon_struct.h +++ b/dnn/src/arm_common/neon_struct.h @@ -11,59 +11,68 @@ */ #pragma once #include "src/arm_common/simd_macro/marm_neon.h" + +#define __ai inline __attribute__((__always_inline__)) namespace megdnn { namespace { struct Vdotq_s32_h { - static int32x4_t impl(int8x16_t& a, int8x16_t& b, int32x4_t& c, - int16x8_t& temp) { + static __ai int32x4_t impl(int8x16_t& a, int8x16_t& b, int32x4_t& c, + int16x8_t& temp) { return vdotq_s32_h(a, b, c, temp); } }; struct Vdot2_s32_h { - static int32x4_t impl(int8x8_t a, int8x8_t b, int32x4_t c, int16x8_t temp) { + static __ai int32x4_t impl(int8x8_t a, int8x8_t b, int32x4_t c, + int16x8_t temp) { return vdot2_s32_h(a, b, c, temp); } }; struct Vmlal_s16 { - static int32x4_t impl(int16x8_t a, int16x8_t b, int32x4_t c) { + static __ai int32x4_t impl(int16x8_t a, int16x8_t b, int32x4_t c) { return vmlal_s16(c, vget_low_s16(a), vget_low_s16(b)); } }; struct Vld1q_s8 { - static int8x16_t impl(const int8_t* ptr) { return vld1q_s8(ptr); } + static __ai int8x16_t impl(const int8_t* ptr) { return vld1q_s8(ptr); } }; struct Vld1q_f32 { - static float32x4_t impl(const float32_t* ptr) { return vld1q_f32(ptr); } + static __ai float32x4_t impl(const float32_t* ptr) { + return vld1q_f32(ptr); + } }; struct Vld1_s8 { - static int8x8_t impl(const int8_t* ptr) { return vld1_s8(ptr); } + static __ai int8x8_t impl(const int8_t* ptr) { return vld1_s8(ptr); } }; struct Vldq_dup_4s8_8s16 { - static int16x8_t impl(const int8_t* ptr) { return vldq_dup_4s8_8s16(ptr); } + static __ai int16x8_t impl(const int8_t* ptr) { + return vldq_dup_4s8_8s16(ptr); + } }; struct Vldq_tbl_low_s8 { - static int8x8_t impl(const int8_t* ptr, uint8x16_t idx) { + static __ai int8x8_t impl(const int8_t* ptr, uint8x16_t idx) { return vldq_tbl_low_s8(ptr, idx); } }; struct Vld1_dup_s8_s16 { - static int16x8_t impl(const int8_t* ptr) { return vld1_dup_s8_s16(ptr); } + static __ai int16x8_t impl(const int8_t* ptr) { + return vld1_dup_s8_s16(ptr); + } }; struct Vfmaq_laneq_f32 { template - static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { + static __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { return vfmaq_laneq_f32(a, b, v, lane); } }; #if __ARM_FEATURE_DOTPROD struct Vdotq_laneq_s32 { template - static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) { + static __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) { return vdotq_laneq_s32(a, b, v, lane); } }; @@ -72,4 +81,5 @@ struct Vdotq_laneq_s32 { } // namespace } // namespace megdnn +#undef __ai // vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/arm_common/simd_macro/marm_neon.h b/dnn/src/arm_common/simd_macro/marm_neon.h index 0019b178f2aeeb288fc02d175d1feae2bd512264..8a6cde61e4abc35d4e17d75c6341ef973cf7754c 100644 --- a/dnn/src/arm_common/simd_macro/marm_neon.h +++ b/dnn/src/arm_common/simd_macro/marm_neon.h @@ -20,7 +20,9 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpragmas" #pragma GCC diagnostic ignored "-Wattributes" -#define __ai static inline __attribute__((__always_inline__, __nodebug__)) +#define __ai \ + static inline \ + __attribute__((__gnu_inline__, __always_inline__, __nodebug__)) #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !MEGDNN_DISABLE_FLOAT16 #define MEGDNN_INC_ARM_FP16(_x) _x @@ -299,16 +301,20 @@ __ai uint32x2_t vdot2_u8(uint8x8_t a, uint8x8_t b) { #endif // __ARM_FEATURE_DOTPROD +#if __GNUC__ < 8 #undef vld1q_f32_x2 __ai float32x4x2_t vld1q_f32_x2(const float* p) { return {{vld1q_f32(p), vld1q_f32(p + 4)}}; } +#endif +#if __GNUC__ < 9 #undef vst1q_f32_x2 __ai void vst1q_f32_x2(const float* p, float32x4x2_t v) { vst1q_f32(const_cast(p), v.val[0]); vst1q_f32(const_cast(p) + 4, v.val[1]); } +#endif __ai int8x16_t vtranslq_s8(int8x8_t a) { int8x16_t ret; @@ -472,18 +478,18 @@ __ai int8x16_t vqtbl1q_s8(int8x16_t& a, uint8x16_t& idx) { namespace { template struct Vdup_laneq_s16_armv7 { - static int16x4_t impl(int16x8_t vec); + __ai int16x4_t impl(int16x8_t vec); }; #define cb(step) \ template <> \ struct Vdup_laneq_s16_armv7 { \ - static int16x4_t impl(int16x8_t vec) { \ + __ai int16x4_t impl(int16x8_t vec) { \ return vdup_lane_s16(vget_high_s16(vec), step); \ } \ }; \ template <> \ struct Vdup_laneq_s16_armv7 { \ - static int16x4_t impl(int16x8_t vec) { \ + __ai int16x4_t impl(int16x8_t vec) { \ return vdup_lane_s16(vget_low_s16(vec), step); \ } \ }; @@ -495,30 +501,30 @@ UNROLL_CALL_RAW(4, cb); namespace { template struct Vfmaq_laneq_f32_armv7 { - static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v); + __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v); }; template <> struct Vfmaq_laneq_f32_armv7<0> { - static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { + __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlaq_lane_f32(a, b, vget_low_f32(v), 0); } }; template <> struct Vfmaq_laneq_f32_armv7<1> { - static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { + __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlaq_lane_f32(a, b, vget_low_f32(v), 1); } }; template <> struct Vfmaq_laneq_f32_armv7<2> { - static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { + __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlaq_lane_f32(a, b, vget_high_f32(v), 0); } }; template <> struct Vfmaq_laneq_f32_armv7<3> { - static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { + __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlaq_lane_f32(a, b, vget_high_f32(v), 1); } }; @@ -527,37 +533,98 @@ struct Vfmaq_laneq_f32_armv7<3> { Vfmaq_laneq_f32_armv7::impl(a, b, v) #if __ARM_FEATURE_DOTPROD +namespace { template struct Vdotq_laneq_s32_armv7 { - static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v); + __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v); }; template <> struct Vdotq_laneq_s32_armv7<0> { - static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) { + __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) { return vdotq_lane_s32(a, b, vget_low_s32(v), 0); } }; template <> struct Vdotq_laneq_s32_armv7<1> { - static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) { + __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) { return vdotq_lane_s32(a, b, vget_low_s32(v), 1); } }; template <> struct Vdotq_laneq_s32_armv7<2> { - static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) { + __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) { return vdotq_lane_s32(a, b, vget_high_s32(v), 0); } }; template <> struct Vdotq_laneq_s32_armv7<3> { - static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) { + __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) { return vdotq_lane_s32(a, b, vget_high_f32(v), 1); } }; #define vdotq_laneq_s32(a, b, v, lane) \ Vdotq_laneq_s32_armv7::impl(a, b, v) +} // namespace +#endif + +#endif + +//! GCC split fmla with lane to dup+fmla when version < 9 +//! https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 +#if !defined(__clang__) && __GNUC__ < 9 +#if MEGDNN_AARCH64 +namespace { + +template +struct Vfmaq_laneq_f32_armv8 { + __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v); +}; +template <> +struct Vfmaq_laneq_f32_armv8<0> { + __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { + asm volatile("fmla %0.4s, %1.4s, %2.s[0]\n" + : "+w"(a) + : "w"(b), "w"(v) + :); + return a; + } +}; +template <> +struct Vfmaq_laneq_f32_armv8<1> { + __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { + asm volatile("fmla %0.4s, %1.4s, %2.s[1]\n" + : "+w"(a) + : "w"(b), "w"(v) + :); + return a; + } +}; +template <> +struct Vfmaq_laneq_f32_armv8<2> { + __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { + asm volatile("fmla %0.4s, %1.4s, %2.s[2]\n" + : "+w"(a) + : "w"(b), "w"(v) + :); + return a; + } +}; +template <> +struct Vfmaq_laneq_f32_armv8<3> { + __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) { + asm volatile("fmla %0.4s, %1.4s, %2.s[3]\n" + : "+w"(a) + : "w"(b), "w"(v) + :); + return a; + } +}; +} // namespace +#undef vfmaq_laneq_f32 +#define vfmaq_laneq_f32(a, b, v, lane) \ + Vfmaq_laneq_f32_armv8::impl(a, b, v) + #endif #endif diff --git a/dnn/test/arm_common/conv_bias_multi_thread.cpp b/dnn/test/arm_common/conv_bias_multi_thread.cpp index ef19dea7e034ac1226c18ab9f86927321358474e..d9f7bd72d92e2407999afd4973965e655c9bb7ef 100644 --- a/dnn/test/arm_common/conv_bias_multi_thread.cpp +++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp @@ -77,7 +77,7 @@ std::vector get_nchw44_conv_bias_args( bool only_no_bias = false) { using namespace conv_bias; using NLMode = param::ConvBias::NonlineMode; - + std::vector args; auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, @@ -172,11 +172,11 @@ std::vector get_nchw44_conv_bias_args( bias_mode.emplace_back(megdnn::BiasMode::NO_BIAS); } if (support_full_bias) { - bias_mode.emplace_back(megdnn::BiasMode::BIAS); + bias_mode.emplace_back(megdnn::BiasMode::BIAS); } for (auto bias : bias_mode) for (auto nlmode : nonlinemode) - for (size_t n : {1,2}) + for (size_t n : {1, 2}) for (size_t kernel : kernel_vec) for (size_t oc : {4, 12}) for (size_t ic : {1, 3, 4, 12}) @@ -364,8 +364,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_SMALL_GROUP) { } TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_NCHW44_S1_K7) { - check_conv_bias(get_nchw44_conv_bias_args({7}, 1, false, true, true, - false, false, false), + check_conv_bias(get_nchw44_conv_bias_args({7}, 1, false, true, true, false, + false, false), handle(), "F32_CONV_NCHW44_DIRECT"); } @@ -403,10 +403,12 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_STR2_SMALL_GROUP) { check_conv_bias(get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false), handle(), "F32STRD2_SMALL_GROUP"); } -TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_NCHW_NCHW44_F32) { +TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_NCHW_NCHW44_F32_S2) { check_conv_bias(get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false, false, true), handle(), "F32_CONV_NCHW_NCHW44"); +} +TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_NCHW_NCHW44_F32_S1) { check_conv_bias(get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, false, true), handle(), "F32_CONV_NCHW_NCHW44"); @@ -566,13 +568,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_QS8_CHANNEL_WISE_DIRECT2_NCHW44) { handle(), "S8_CHAN_WISE_STRD2_NCHW44"); } -TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_NCHW_NCHW44) { +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_NCHW_NCHW44_S1) { checker_conv_bias_qint8x8x8( - get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false, false, + get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, false, true), handle(), "S8_CONV_NCHW_NCHW44"); +} +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_NCHW_NCHW44_S2) { checker_conv_bias_qint8x8x8( - get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, false, + get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false, false, true), handle(), "S8_CONV_NCHW_NCHW44"); } @@ -1820,7 +1824,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) { TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args( - {2, 4, 7}, 1, false, false, false, false, false, true,true); + {2, 4, 7}, 1, false, false, false, false, false, true, true); #if MEGDNN_AARCH64 check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); #elif MEGDNN_ARMV7 @@ -1841,7 +1845,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32) { TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32_FUSE) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args( - {3}, 2, false, false, false, false, false, true, true,false); + {3}, 2, false, false, false, false, false, true, true, false); #if MEGDNN_AARCH64 check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); #elif MEGDNN_ARMV7 diff --git a/toolchains/aarch64-none-linux-gnu.toolchain.cmake b/toolchains/aarch64-none-linux-gnu.toolchain.cmake new file mode 100644 index 0000000000000000000000000000000000000000..df9c1ab4c2b21353104a1c6d6b64bc10eaf6108c --- /dev/null +++ b/toolchains/aarch64-none-linux-gnu.toolchain.cmake @@ -0,0 +1,6 @@ +set(ARM_CROSS_BUILD_ARCH aarch64) +set(CMAKE_C_COMPILER "aarch64-none-linux-gnu-gcc") +set(CMAKE_CXX_COMPILER "aarch64-none-linux-gnu-g++") +set(CMAKE_C_FLAGS "-Werror=unused-parameter -Wno-psabi") +set(CMAKE_CXX_FLAGS "-Werror=unused-parameter -Wno-psabi") +set(CMAKE_STRIP "aarch64-none-linux-gnu-strip")