diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7405ee25055c37a67762291dc6409c6a44214385..f773219646884b09bf1202ad83c0c7590364cb03 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,8 +128,11 @@ else()
     set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g")
     if(ANDROID)
         set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -DNDEBUG")
+        set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -DNDEBUG -g")
+        
     else()
         set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+        set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -DNDEBUG -g")
     endif()
 endif()
 
diff --git a/dnn/include/megdnn/arch.h b/dnn/include/megdnn/arch.h
index bc912d64298fa934e0fefa816974822b0de324ac..0eef23510f8be17fef9344bb0147b266c400eb76 100644
--- a/dnn/include/megdnn/arch.h
+++ b/dnn/include/megdnn/arch.h
@@ -29,6 +29,13 @@
  #define megdnn_likely(v) __builtin_expect(bool(v), 1)
  #define megdnn_unlikely(v) __builtin_expect(bool(v), 0)
 
+#if !defined(__clang__) && MEGDNN_ARMV7 && !defined(NDEBUG)
+//! Thumb2 limit code length
+#define MEGDNN_ALWAYS_INLINE
+#else
+#define MEGDNN_ALWAYS_INLINE inline __attribute__((__always_inline__))
+#endif
+
  #define MEGDNN_DEPRECATED __attribute__((deprecated))
  #define MEGDNN_PACKED __attribute__((packed))
  #define MEGDNN_CONSTEXPR constexpr
diff --git a/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_kern.h b/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_kern.h
index 205855f55560193fd8c7c5c8871fed96bea5bdce..70b7a1de3e7384f22107c67b7478e426b6320912 100644
--- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_kern.h
+++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_kern.h
@@ -10,6 +10,7 @@
  * implied.
  */
 #pragma once
+#include "megdnn/arch.h"
 #include "src/arm_common/conv_bias/intrinsic_helper.h"
 #include "src/arm_common/conv_bias/opr_impl.h"
 #include "src/arm_common/elemwise_op.h"
@@ -17,7 +18,6 @@
 #include "src/common/unroll_macro.h"
 #include "src/common/utils.h"
 #include "src/fallback/conv_bias/common.h"
-
 namespace megdnn {
 namespace arm_common {
 namespace {
@@ -32,13 +32,13 @@ namespace {
 template <int src_idx, int weight_idx, int c_dim, typename Func, int stride,
           typename T, typename T2, typename T3>
 struct ShiftCalHelper {
-    static void impl(T& c, T2& src, T3& weight);
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight);
 };
 
 template <int src_idx, int weight_idx, typename Func, int stride, typename T,
           typename T2, typename T3>
 struct ShiftCalHelper<src_idx, weight_idx, 2, Func, stride, T, T2, T3> {
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step)                                                     \
     c[0][step] = Func::template impl<(step * stride + src_idx) % 4>( \
             c[0][step], weight[0][weight_idx],                       \
@@ -54,7 +54,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 2, Func, stride, T, T2, T3> {
 template <int src_idx, int weight_idx, typename Func, int stride, typename T,
           typename T2, typename T3>
 struct ShiftCalHelper<src_idx, weight_idx, 1, Func, stride, T, T2, T3> {
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step)                                                     \
     c[0][step] = Func::template impl<(step * stride + src_idx) % 4>( \
             c[0][step], weight[0][weight_idx],                       \
@@ -67,7 +67,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 1, Func, stride, T, T2, T3> {
 
 template <int src_idx, int weight_idx, int c_dim, typename FUNC, int stride,
           typename T, typename T2, typename T3>
-inline void cal_helper(T& c, T2& src, T3& weight) {
+MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) {
     ShiftCalHelper<src_idx, weight_idx, c_dim, FUNC, stride, T, T2, T3>::impl(
             c, src, weight);
 };
diff --git a/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.cpp b/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.cpp
index 9daf67a1ce6a1c1838da010ba777bf1c9fa2e818..ba8af1efc0fd0341a42e82d0f07f712958644da3 100644
--- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.cpp
+++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.cpp
@@ -11,6 +11,7 @@
  * implied.
  */
 
+#include "megdnn/arch.h"
 #include "src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.h"
 #include "src/arm_common/conv_bias/intrinsic_helper.h"
 #include "src/arm_common/elemwise_op.h"
@@ -26,13 +27,13 @@ namespace {
 template <int src_idx, int weight_idx, int c_dim, typename Func, int ow_block,
           typename T, typename T2, typename T3, typename T4>
 struct ShiftCalHelper {
-    static void impl(T& c, T2& src, T3& weight);
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight);
 };
 
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 8, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                                  \
     c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \
                                            src[(step + src_idx) % 8]);  \
@@ -49,7 +50,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 8, T, T2, T3, T4> {
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 4, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                                  \
     c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \
                                            src[(step + src_idx) % 4]);  \
@@ -66,7 +67,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 4, T, T2, T3, T4> {
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 8, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                                  \
     c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \
                                            src[(step + src_idx) % 8]);
@@ -81,7 +82,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 8, T, T2, T3, T4> {
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 4, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                                  \
     c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \
                                            src[(step + src_idx) % 4]);
@@ -96,7 +97,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 4, T, T2, T3, T4> {
 
 template <int src_idx, int weight_idx, int c_dim, typename FUNC, int ow_block,
           typename T, typename T2, typename T3>
-inline void cal_helper(T& c, T2& src, T3& weight) {
+MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) {
     ShiftCalHelper<src_idx, weight_idx, c_dim, FUNC, ow_block, T, T2, T3,
                    int>::impl(c, src, weight);
 };
diff --git a/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.cpp b/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.cpp
index 2b2b73d8c11ec737122ebc47929f30b82143ed76..78019dd6b8c391499cbbc871137c436ae64e0be8 100644
--- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.cpp
+++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.cpp
@@ -11,6 +11,7 @@
  * implied.
  */
 
+#include "megdnn/arch.h"
 #include "src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h"
 #include "src/arm_common/conv_bias/intrinsic_helper.h"
 #include "src/arm_common/elemwise_op.h"
@@ -26,13 +27,13 @@ namespace {
 template <int src_idx, int weight_idx, int c_dim, typename Func, int ow_block,
           typename T, typename T2, typename T3, typename T4>
 struct ShiftCalHelper {
-    static void impl(T& c, T2& src, T3& weight);
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight);
 };
 
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 8, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                                  \
     c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \
                                            src[(step + src_idx) % 8]);  \
@@ -49,7 +50,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 8, T, T2, T3, T4> {
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 4, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                                  \
     c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \
                                            src[(step + src_idx) % 4]);  \
@@ -66,7 +67,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 4, T, T2, T3, T4> {
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 8, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                                  \
     c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \
                                            src[(step + src_idx) % 8]);
@@ -81,7 +82,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 8, T, T2, T3, T4> {
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 4, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                                  \
     c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \
                                            src[(step + src_idx) % 4]);
@@ -96,7 +97,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 4, T, T2, T3, T4> {
 
 template <int src_idx, int weight_idx, int c_dim, typename FUNC, int ow_block,
           typename T, typename T2, typename T3>
-inline void cal_helper(T& c, T2& src, T3& weight) {
+MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) {
     ShiftCalHelper<src_idx, weight_idx, c_dim, FUNC, ow_block, T, T2, T3,
                    int>::impl(c, src, weight);
 };
@@ -462,9 +463,10 @@ inline void odd_even_split_iw8_even(float* sptr_base, const float* sptr,
     vst1q_f32(sptr_base + odd_offset + 2 * ic_step, temp[5]);
     vst1q_f32(sptr_base + odd_offset + 3 * ic_step, temp[7]);
 }
-void odd_even_split_iw8_odd(float* sptr_base, const float* sptr,
-                            const int odd_start, const int src_idx,
-                            const int iw_idx) {
+
+inline void odd_even_split_iw8_odd(float* sptr_base, const float* sptr,
+                                   const int odd_start, const int src_idx,
+                                   const int iw_idx) {
     constexpr int ic_step = 4;
     const int src_offset = src_idx * ic_step;
     const int even_offset = (iw_idx + 1) / 2 * ic_step;
diff --git a/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_kern.h b/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_kern.h
index 4b57156f950b7fec5fedb00008686e93c0756072..3d9e9a0865ee55062688738555e8538025ddec52 100644
--- a/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_kern.h
+++ b/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_kern.h
@@ -5,11 +5,13 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
-
+#pragma once
 #ifdef __ARM_FEATURE_DOTPROD
 
+#include "megdnn/arch.h"
 #include "src/arm_common/conv_bias/intrinsic_helper.h"
 #include "src/arm_common/elemwise_op.h"
 #include "src/arm_common/intrinsic_helper.h"
@@ -27,8 +29,8 @@ constexpr int filter_next_col =
         IC_PACK_SIZE * OC_PACK_SIZE;  //! [OC/4, IC/4, FH, FW, 4OC, 4IC]
 
 template <int row, BiasMode bias_mode>
-inline void init_ocx_ow8(int32x4_t c[][8], const int32_t* bias_ptr,
-                         int oc_step) {
+MEGDNN_ALWAYS_INLINE void init_ocx_ow8(int32x4_t c[][8],
+                                       const int32_t* bias_ptr, int oc_step) {
     static_assert(row == 1 || row == 2 || row == 3, "Invalid OC number.");
     if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
 #define BIAS_INIT(step, i) c[i][step] = vld1q_s32(bias_ptr + i * oc_step);
@@ -90,12 +92,13 @@ inline void init_ocx_ow8(int32x4_t c[][8], const int32_t* bias_ptr,
 
 template <int row, int ow_remain, typename Op, typename T>
 struct StoreOCxOWx {
-    static void impl(int32x4_t res[][8], const Op& op, T* dst_ptr,
-                     const int ld_dst_oc);
+    static MEGDNN_ALWAYS_INLINE void impl(int32x4_t res[][8], const Op& op,
+                                          T* dst_ptr, const int ld_dst_oc);
 };
 
 template <int ow_remain, typename Op, typename T>
 struct StoreOCxOWx<1, ow_remain, Op, T> {
+
     static void impl(int32x4_t res[][8], const Op& op, T* dst_ptr,
                      const int ld_dst_oc) {
         MEGDNN_MARK_USED_VAR(ld_dst_oc);
@@ -128,8 +131,8 @@ struct StoreOCxOWx<1, ow_remain, Op, T> {
 
 template <int ow_remain, typename Op, typename T>
 struct StoreOCxOWx<2, ow_remain, Op, T> {
-    static void impl(int32x4_t res[][8], const Op& op, T* dst_ptr,
-                     const int ld_dst_oc) {
+    static MEGDNN_ALWAYS_INLINE void impl(int32x4_t res[][8], const Op& op,
+                                          T* dst_ptr, const int ld_dst_oc) {
         switch (ow_remain) {
             case 8:
                 UNROLL_CALL_RAW(4, cb22);
@@ -159,8 +162,8 @@ struct StoreOCxOWx<2, ow_remain, Op, T> {
 
 template <int ow_remain, typename Op, typename T>
 struct StoreOCxOWx<3, ow_remain, Op, T> {
-    static void impl(int32x4_t res[][8], const Op& op, T* dst_ptr,
-                     const int ld_dst_oc) {
+    static MEGDNN_ALWAYS_INLINE void impl(int32x4_t res[][8], const Op& op,
+                                          T* dst_ptr, const int ld_dst_oc) {
         switch (ow_remain) {
             case 8:
                 UNROLL_CALL_RAW(4, cb32);
@@ -196,15 +199,16 @@ struct StoreOCxOWx<3, ow_remain, Op, T> {
 #undef cb32
 
 template <int row, int ow_remain, typename Op, typename T>
-inline void store_ocx_owx_remain_static(int32x4_t res[][8], const Op& op,
-                                        T* dst_ptr, const int ld_dst_oc) {
+MEGDNN_ALWAYS_INLINE void store_ocx_owx_remain_static(int32x4_t res[][8],
+                                                      const Op& op, T* dst_ptr,
+                                                      const int ld_dst_oc) {
     StoreOCxOWx<row, ow_remain, Op, T>::impl(res, op, dst_ptr, ld_dst_oc);
 }
 
 template <int res_row, int src_row, int src_start_idx, int weight_idx,
           typename FUNC, typename T, typename T2, typename T3>
 struct ShiftCalHelper {
-    static void impl(T& res, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& res, T2& src, T3& weight) {
 #define cb(step)                                                            \
     res[res_row][step] = FUNC::template impl<((src_start_idx + step) % 4)>( \
             res[res_row][step], weight[weight_idx],                         \
@@ -216,7 +220,7 @@ struct ShiftCalHelper {
 
 template <int res_row, int src_row, int src_start_idx, int weight_idx,
           typename FUNC, typename T, typename T2, typename T3>
-inline void cal_helper(T& res, T2& src, T3& weight) {
+MEGDNN_ALWAYS_INLINE void cal_helper(T& res, T2& src, T3& weight) {
     ShiftCalHelper<res_row, src_row, src_start_idx, weight_idx, FUNC, T, T2,
                    T3>::impl(res, src, weight);
 };
@@ -428,4 +432,4 @@ struct KernNeonSdotNCHW44<dst_type, 2, bias_mode, Op, ow_remain, filter_size,
 
 #endif
 
-//vim: syntax=cpp.doxygen
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/int8/direct_nchw_nchw44_kern.h b/dnn/src/arm_common/conv_bias/int8/direct_nchw_nchw44_kern.h
index bc2b2c1a2432531a1e36f2e0953bf3c0acd9e1d7..db6bbf65e69963693c2581fa4b4229c06892670b 100644
--- a/dnn/src/arm_common/conv_bias/int8/direct_nchw_nchw44_kern.h
+++ b/dnn/src/arm_common/conv_bias/int8/direct_nchw_nchw44_kern.h
@@ -10,6 +10,7 @@
  * implied.
  */
 #pragma once
+#include "megdnn/arch.h"
 #include "src/arm_common/conv_bias/intrinsic_helper.h"
 #include "src/arm_common/conv_bias/opr_impl.h"
 #include "src/arm_common/elemwise_op.h"
@@ -37,13 +38,13 @@ namespace {
 template <int src_idx, int weight_idx, int c_dim, typename Func, int stride,
           typename T, typename T2, typename T3, typename T4>
 struct ShiftCalHelper {
-    static void impl(T& c, T2& src, T3& weight, T4& temp);
-    static void impl(T& c, T2& src, T3& weight);
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp);
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight);
 };
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 2, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight, T4& temp) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) {
         c[0][0] = Func::impl(src[0 + src_idx], weight[0][weight_idx], c[0][0],
                              temp[0]);
         c[1][0] = Func::impl(src[0 + src_idx], weight[1][weight_idx], c[1][0],
@@ -61,7 +62,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 2, T, T2, T3, T4> {
         c[1][3] = Func::impl(src[3 + src_idx], weight[1][weight_idx], c[1][3],
                              temp[3]);
     }
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
         c[0][0] = Func::impl(src[0 + src_idx], weight[0][weight_idx], c[0][0]);
         c[1][0] = Func::impl(src[0 + src_idx], weight[1][weight_idx], c[1][0]);
         c[0][1] = Func::impl(src[1 + src_idx], weight[0][weight_idx], c[0][1]);
@@ -75,7 +76,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 2, T, T2, T3, T4> {
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 2, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight, T4& temp) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) {
         c[0][0] = Func::impl(src[0 + src_idx], weight[0][weight_idx], c[0][0],
                              temp[0]);
         c[0][1] = Func::impl(src[1 + src_idx], weight[0][weight_idx], c[0][1],
@@ -85,7 +86,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 2, T, T2, T3, T4> {
         c[0][3] = Func::impl(src[3 + src_idx], weight[0][weight_idx], c[0][3],
                              temp[2]);
     }
-    static void impl(T& c, T2& src, T3& weight) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
         c[0][0] = Func::impl(src[0 + src_idx], weight[0][weight_idx], c[0][0]);
         c[0][1] = Func::impl(src[1 + src_idx], weight[0][weight_idx], c[0][1]);
         c[0][2] = Func::impl(src[2 + src_idx], weight[0][weight_idx], c[0][2]);
@@ -96,7 +97,7 @@ struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 2, T, T2, T3, T4> {
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 1, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight, T4& temp) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) {
         c[0][0] = Func::impl(src[(0 + src_idx) % 8], weight[0][weight_idx],
                              c[0][0], temp[0]);
         c[1][0] = Func::impl(src[(0 + src_idx) % 8], weight[1][weight_idx],
@@ -131,12 +132,12 @@ struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 1, T, T2, T3, T4> {
         c[1][7] = Func::impl(src[(7 + src_idx) % 8], weight[1][weight_idx],
                              c[1][7], temp[3]);
     }
-    static void impl(T&, T2&, T3&);
+    static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&);
 };
 template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
           typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 1, T, T2, T3, T4> {
-    static void impl(T& c, T2& src, T3& weight, T4& temp) {
+    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) {
         c[0][0] = Func::impl(src[(0 + src_idx) % 8], weight[0][weight_idx],
                              c[0][0], temp[0]);
         c[0][1] = Func::impl(src[(1 + src_idx) % 8], weight[0][weight_idx],
@@ -154,18 +155,18 @@ struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 1, T, T2, T3, T4> {
         c[0][7] = Func::impl(src[(7 + src_idx) % 8], weight[0][weight_idx],
                              c[0][7], temp[3]);
     }
-    static void impl(T&, T2&, T3&);
+    static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&);
 };
 
 template <int src_idx, int weight_idx, int c_dim, typename FUNC, int stride,
           typename T, typename T2, typename T3, typename T4>
-inline void cal_helper(T& c, T2& src, T3& weight, T4& temp) {
+MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight, T4& temp) {
     ShiftCalHelper<src_idx, weight_idx, c_dim, FUNC, stride, T, T2, T3,
                    T4>::impl(c, src, weight, temp);
 }
 template <int src_idx, int weight_idx, int c_dim, typename FUNC, int stride,
           typename T, typename T2, typename T3>
-inline void cal_helper(T& c, T2& src, T3& weight) {
+MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) {
     ShiftCalHelper<src_idx, weight_idx, c_dim, FUNC, stride, T, T2, T3,
                    int>::impl(c, src, weight);
 };
@@ -703,8 +704,9 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 7, oc_block, 1> {
 
 enum PACK_MODE { NO_PAD = 0, FIRST_PAD = 1, LAST_PAD = 2 };
 template <PACK_MODE mode>
-inline void pack_src_one_line(const int8_t* inptr, int8_t* outptr, int left_pad,
-                              int right_pad, const int iw) {
+MEGDNN_ALWAYS_INLINE void pack_src_one_line(const int8_t* inptr, int8_t* outptr,
+                                            int left_pad, int right_pad,
+                                            const int iw) {
     const int8_t* src_row_0 = inptr;
     const int8_t* src_row_1 = inptr + iw;
     constexpr int combine_row = 2;
@@ -1235,6 +1237,7 @@ struct ConvDiectStrideInt8NchwNchw44<bias_mode, Op, filter_size, 1> {
                 }
             }
         }
+
         if (oc_remain > 0) {
             size_t oc_idx = oc_end;
             const size_t weight_offset = oc_idx * ic * fh * fw;
@@ -1284,4 +1287,5 @@ static void conv_direct_int8_nchw_nchw44(const int8_t* src,
 }  // namespace
 }  // namespace arm_common
 }  // namespace megdnn
-   // vim: syntax=cpp.doxygen
\ No newline at end of file
+
+// vim: syntax=cpp.doxygen
\ No newline at end of file
diff --git a/dnn/src/arm_common/conv_bias/intrinsic_helper.h b/dnn/src/arm_common/conv_bias/intrinsic_helper.h
index 67e2d3f7747094763635a02037dc1e77969cd346..e27f8e2946b3b08de8201d11a76bff9398bec828 100644
--- a/dnn/src/arm_common/conv_bias/intrinsic_helper.h
+++ b/dnn/src/arm_common/conv_bias/intrinsic_helper.h
@@ -15,18 +15,20 @@
 #include "src/arm_common/simd_macro/marm_neon.h"
 #include "src/common/unroll_macro.h"
 #include "src/fallback/conv_bias/common.h"
+
+#define __ai inline __attribute__((__always_inline__))
 namespace megdnn {
 namespace {
 
 ////////////////////Store_OC4_OW8_Remain/////////////////////////
 template <int ow_remain, typename Op>
 struct Store_OC4_OW8_Remain {
-    static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr);
+    static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr);
 };
 
 template <typename Op>
 struct Store_OC4_OW8_Remain<0, Op> {
-    static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
+    static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
         op({{c[0], c[1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[2], c[3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
         op({{c[4], c[5]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 16));
@@ -36,7 +38,7 @@ struct Store_OC4_OW8_Remain<0, Op> {
 
 template <typename Op>
 struct Store_OC4_OW8_Remain<7, Op> {
-    static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
+    static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
         op({{c[0], c[1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[2], c[3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
         op({{c[4], c[5]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 16));
@@ -45,7 +47,7 @@ struct Store_OC4_OW8_Remain<7, Op> {
 };
 template <typename Op>
 struct Store_OC4_OW8_Remain<6, Op> {
-    static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
+    static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
         op({{c[0], c[1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[2], c[3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
         op({{c[4], c[5]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 16));
@@ -53,7 +55,7 @@ struct Store_OC4_OW8_Remain<6, Op> {
 };
 template <typename Op>
 struct Store_OC4_OW8_Remain<5, Op> {
-    static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
+    static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
         op({{c[0], c[1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[2], c[3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
         op(c[4], reinterpret_cast<dt_qint8*>(dst_ptr + 16));
@@ -61,46 +63,46 @@ struct Store_OC4_OW8_Remain<5, Op> {
 };
 template <typename Op>
 struct Store_OC4_OW8_Remain<4, Op> {
-    static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
+    static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
         op({{c[0], c[1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[2], c[3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
     }
 };
 template <typename Op>
 struct Store_OC4_OW8_Remain<3, Op> {
-    static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
+    static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
         op({{c[0], c[1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op(c[2], reinterpret_cast<dt_qint8*>(dst_ptr + 8));
     }
 };
 template <typename Op>
 struct Store_OC4_OW8_Remain<2, Op> {
-    static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
+    static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
         op({{c[0], c[1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
     }
 };
 template <typename Op>
 struct Store_OC4_OW8_Remain<1, Op> {
-    static void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
+    static __ai void impl(int32x4_t c[8], const Op& op, int8_t* dst_ptr) {
         op(c[0], reinterpret_cast<dt_qint8*>(dst_ptr));
     }
 };
 
 template <int ow_remain, typename Op>
-inline void store_oc4_ow8_remain_static(int32x4_t c[8], const Op& op,
-                                        int8_t* dst_ptr) {
+__ai void store_oc4_ow8_remain_static(int32x4_t c[8], const Op& op,
+                                      int8_t* dst_ptr) {
     Store_OC4_OW8_Remain<ow_remain, Op>::impl(c, op, dst_ptr);
 }
 
 template <int c_dim, int ow_remain, typename Op, typename T>
 struct StoreOcxOw4Remain {
-    static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc);
+    static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc);
 };
 
 template <typename Op, typename T>
 struct StoreOcxOw4Remain<2, 0, Op, T> {
-    static void impl(int32x4_t c[2][4], const Op& op, int8_t* dst_ptr,
-                     int ld_dst_oc) {
+    static __ai void impl(int32x4_t c[2][4], const Op& op, int8_t* dst_ptr,
+                          int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
 
@@ -113,7 +115,7 @@ struct StoreOcxOw4Remain<2, 0, Op, T> {
 
 template <typename Op, typename T>
 struct StoreOcxOw4Remain<2, 3, Op, T> {
-    static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op(c[0][2], reinterpret_cast<dt_qint8*>(dst_ptr + 8));
 
@@ -124,7 +126,7 @@ struct StoreOcxOw4Remain<2, 3, Op, T> {
 };
 template <typename Op, typename T>
 struct StoreOcxOw4Remain<2, 2, Op, T> {
-    static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[1][0], c[1][1]}},
            reinterpret_cast<dt_qint8*>(dst_ptr + ld_dst_oc));
@@ -132,7 +134,7 @@ struct StoreOcxOw4Remain<2, 2, Op, T> {
 };
 template <typename Op, typename T>
 struct StoreOcxOw4Remain<2, 1, Op, T> {
-    static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
         op(c[0][0], reinterpret_cast<dt_qint8*>(dst_ptr));
         op(c[1][0], reinterpret_cast<dt_qint8*>(dst_ptr + ld_dst_oc));
     }
@@ -140,8 +142,8 @@ struct StoreOcxOw4Remain<2, 1, Op, T> {
 
 template <typename Op, typename T>
 struct StoreOcxOw4Remain<1, 0, Op, T> {
-    static void impl(int32x4_t c[2][4], const Op& op, int8_t* dst_ptr,
-                     int ld_dst_oc) {
+    static __ai void impl(int32x4_t c[2][4], const Op& op, int8_t* dst_ptr,
+                          int ld_dst_oc) {
         MEGDNN_MARK_USED_VAR(ld_dst_oc);
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
@@ -150,7 +152,7 @@ struct StoreOcxOw4Remain<1, 0, Op, T> {
 
 template <typename Op, typename T>
 struct StoreOcxOw4Remain<1, 3, Op, T> {
-    static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
         MEGDNN_MARK_USED_VAR(ld_dst_oc);
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op(c[0][2], reinterpret_cast<dt_qint8*>(dst_ptr + 8));
@@ -158,33 +160,33 @@ struct StoreOcxOw4Remain<1, 3, Op, T> {
 };
 template <typename Op, typename T>
 struct StoreOcxOw4Remain<1, 2, Op, T> {
-    static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
         MEGDNN_MARK_USED_VAR(ld_dst_oc);
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
     }
 };
 template <typename Op, typename T>
 struct StoreOcxOw4Remain<1, 1, Op, T> {
-    static void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, int8_t* dst_ptr, int ld_dst_oc) {
         MEGDNN_MARK_USED_VAR(ld_dst_oc);
         op(c[0][0], reinterpret_cast<dt_qint8*>(dst_ptr));
     }
 };
 template <int c_dim, int ow_remain, typename Op, typename T>
-inline void store_ocx_ow4_remain_static(T& c, const Op& op, int8_t* dst_ptr,
-                                        int ld_dst_oc) {
+__ai void store_ocx_ow4_remain_static(T& c, const Op& op, int8_t* dst_ptr,
+                                      int ld_dst_oc) {
     StoreOcxOw4Remain<c_dim, ow_remain, Op, T>::impl(c, op, dst_ptr, ld_dst_oc);
 }
 ////////////////////Store_OCX_OW8_Remain/////////////////////////
 template <int c_dim, int ow_remain, typename Op, typename T, typename T2,
           typename T3>
 struct StoreOcxOw8Remain {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc);
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc);
 };
 
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<2, 0, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
         op({{c[0][4], c[0][5]}}, reinterpret_cast<T3>(dst_ptr + 16));
@@ -200,7 +202,7 @@ struct StoreOcxOw8Remain<2, 0, Op, T, T2, T3> {
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<2, 8, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
         op({{c[0][4], c[0][5]}}, reinterpret_cast<T3>(dst_ptr + 16));
@@ -216,7 +218,7 @@ struct StoreOcxOw8Remain<2, 8, Op, T, T2, T3> {
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<2, 7, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
         op({{c[0][4], c[0][5]}}, reinterpret_cast<T3>(dst_ptr + 16));
@@ -231,7 +233,7 @@ struct StoreOcxOw8Remain<2, 7, Op, T, T2, T3> {
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<2, 6, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
         op({{c[0][4], c[0][5]}}, reinterpret_cast<T3>(dst_ptr + 16));
@@ -244,7 +246,7 @@ struct StoreOcxOw8Remain<2, 6, Op, T, T2, T3> {
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<2, 5, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
         op(c[0][4], reinterpret_cast<T3>(dst_ptr + 16));
@@ -256,7 +258,7 @@ struct StoreOcxOw8Remain<2, 5, Op, T, T2, T3> {
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<2, 4, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
 
@@ -266,7 +268,7 @@ struct StoreOcxOw8Remain<2, 4, Op, T, T2, T3> {
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<2, 3, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op(c[0][2], reinterpret_cast<T3>(dst_ptr + 8));
 
@@ -276,14 +278,14 @@ struct StoreOcxOw8Remain<2, 3, Op, T, T2, T3> {
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<2, 2, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[1][0], c[1][1]}}, reinterpret_cast<T3>(dst_ptr + ld_dst_oc));
     }
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<2, 1, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) {
         op(c[0][0], reinterpret_cast<T3>(dst_ptr));
         op(c[1][0], reinterpret_cast<T3>(dst_ptr + ld_dst_oc));
     }
@@ -291,7 +293,7 @@ struct StoreOcxOw8Remain<2, 1, Op, T, T2, T3> {
 
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<1, 0, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
         op({{c[0][4], c[0][5]}}, reinterpret_cast<T3>(dst_ptr + 16));
@@ -300,7 +302,7 @@ struct StoreOcxOw8Remain<1, 0, Op, T, T2, T3> {
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<1, 8, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
         op({{c[0][4], c[0][5]}}, reinterpret_cast<T3>(dst_ptr + 16));
@@ -310,7 +312,7 @@ struct StoreOcxOw8Remain<1, 8, Op, T, T2, T3> {
 
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<1, 7, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
         op({{c[0][4], c[0][5]}}, reinterpret_cast<T3>(dst_ptr + 16));
@@ -319,7 +321,7 @@ struct StoreOcxOw8Remain<1, 7, Op, T, T2, T3> {
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<1, 6, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
         op({{c[0][4], c[0][5]}}, reinterpret_cast<T3>(dst_ptr + 16));
@@ -327,7 +329,7 @@ struct StoreOcxOw8Remain<1, 6, Op, T, T2, T3> {
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<1, 5, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
         op(c[0][4], reinterpret_cast<T3>(dst_ptr + 16));
@@ -335,41 +337,41 @@ struct StoreOcxOw8Remain<1, 5, Op, T, T2, T3> {
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<1, 4, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<T3>(dst_ptr + 8));
     }
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<1, 3, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
         op(c[0][2], reinterpret_cast<T3>(dst_ptr + 8));
     }
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<1, 2, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<T3>(dst_ptr));
     }
 };
 template <typename Op, typename T, typename T2, typename T3>
 struct StoreOcxOw8Remain<1, 1, Op, T, T2, T3> {
-    static void impl(T& c, const Op& op, T2 dst_ptr, int) {
+    static __ai void impl(T& c, const Op& op, T2 dst_ptr, int) {
         op(c[0][0], reinterpret_cast<T3>(dst_ptr));
     }
 };
 
 template <int c_dim, int ow_remain, typename Op, typename T, typename T2>
-inline void store_ocx_ow8_remain_static(T& c, const Op& op, T2 dst_ptr,
-                                        int ld_dst_oc) {
+__ai void store_ocx_ow8_remain_static(T& c, const Op& op, T2 dst_ptr,
+                                      int ld_dst_oc) {
     StoreOcxOw8Remain<c_dim, ow_remain, Op, T, T2, T2>::impl(c, op, dst_ptr,
                                                              ld_dst_oc);
 }
 template <int c_dim, int ow_remain, typename Op, typename T3, typename T,
           typename T2>
-inline void store_ocx_ow8_remain_static_dt(T& c, const Op& op, T2 dst_ptr,
-                                           int ld_dst_oc) {
+__ai void store_ocx_ow8_remain_static_dt(T& c, const Op& op, T2 dst_ptr,
+                                         int ld_dst_oc) {
     StoreOcxOw8Remain<c_dim, ow_remain, Op, T, T2, T3>::impl(c, op, dst_ptr,
                                                              ld_dst_oc);
 }
@@ -377,14 +379,14 @@ inline void store_ocx_ow8_remain_static_dt(T& c, const Op& op, T2 dst_ptr,
 
 template <int ow_remain, typename Op>
 struct Store_OC8_OW8_Remain {
-    static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
-                     int ld_dst_oc);
+    static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
+                          int ld_dst_oc);
 };
 
 template <typename Op>
 struct Store_OC8_OW8_Remain<0, Op> {
-    static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
-                     int ld_dst_oc) {
+    static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
+                          int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
         op({{c[0][4], c[0][5]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 16));
@@ -403,8 +405,8 @@ struct Store_OC8_OW8_Remain<0, Op> {
 
 template <typename Op>
 struct Store_OC8_OW8_Remain<7, Op> {
-    static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
-                     int ld_dst_oc) {
+    static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
+                          int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
         op({{c[0][4], c[0][5]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 16));
@@ -422,8 +424,8 @@ struct Store_OC8_OW8_Remain<7, Op> {
 
 template <typename Op>
 struct Store_OC8_OW8_Remain<6, Op> {
-    static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
-                     int ld_dst_oc) {
+    static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
+                          int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
         op({{c[0][4], c[0][5]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 16));
@@ -439,8 +441,8 @@ struct Store_OC8_OW8_Remain<6, Op> {
 
 template <typename Op>
 struct Store_OC8_OW8_Remain<5, Op> {
-    static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
-                     int ld_dst_oc) {
+    static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
+                          int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
         op(c[0][4], reinterpret_cast<dt_qint8*>(dst_ptr + 16));
@@ -455,8 +457,8 @@ struct Store_OC8_OW8_Remain<5, Op> {
 
 template <typename Op>
 struct Store_OC8_OW8_Remain<4, Op> {
-    static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
-                     int ld_dst_oc) {
+    static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
+                          int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[0][2], c[0][3]}}, reinterpret_cast<dt_qint8*>(dst_ptr + 8));
 
@@ -469,8 +471,8 @@ struct Store_OC8_OW8_Remain<4, Op> {
 
 template <typename Op>
 struct Store_OC8_OW8_Remain<3, Op> {
-    static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
-                     int ld_dst_oc) {
+    static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
+                          int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op(c[0][2], reinterpret_cast<dt_qint8*>(dst_ptr + 8));
 
@@ -481,8 +483,8 @@ struct Store_OC8_OW8_Remain<3, Op> {
 };
 template <typename Op>
 struct Store_OC8_OW8_Remain<2, Op> {
-    static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
-                     int ld_dst_oc) {
+    static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
+                          int ld_dst_oc) {
         op({{c[0][0], c[0][1]}}, reinterpret_cast<dt_qint8*>(dst_ptr));
         op({{c[1][0], c[1][1]}},
            reinterpret_cast<dt_qint8*>(dst_ptr + ld_dst_oc));
@@ -490,8 +492,8 @@ struct Store_OC8_OW8_Remain<2, Op> {
 };
 template <typename Op>
 struct Store_OC8_OW8_Remain<1, Op> {
-    static void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
-                     int ld_dst_oc) {
+    static __ai void impl(int32x4_t c[2][8], const Op& op, int8_t* dst_ptr,
+                          int ld_dst_oc) {
         op(c[0][0], reinterpret_cast<dt_qint8*>(dst_ptr));
         op(c[1][0], reinterpret_cast<dt_qint8*>(dst_ptr + ld_dst_oc));
     }
@@ -500,14 +502,14 @@ struct Store_OC8_OW8_Remain<1, Op> {
 ///////////
 
 template <int ow_remain, typename Op, typename T, typename T2>
-inline void store_oc8_ow8_remain_static(T& c, const Op& op, T2 dst_ptr,
-                                        int ld_dst_oc) {
+__ai void store_oc8_ow8_remain_static(T& c, const Op& op, T2 dst_ptr,
+                                      int ld_dst_oc) {
     Store_OC8_OW8_Remain<ow_remain, Op>::impl(c, op, dst_ptr, ld_dst_oc);
 }
 
 //////////////////////////////////////
 template <BiasMode bias_mode>
-inline void init_oc4_ow8(int32x4_t c[8], const int32_t* bias_ptr) {
+__ai void init_oc4_ow8(int32x4_t c[8], const int32_t* bias_ptr) {
     if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
 #define BAIS_INIT(step) c[step] = vld1q_s32(bias_ptr);
         UNROLL_CALL_RAW(8, BAIS_INIT);
@@ -520,8 +522,8 @@ inline void init_oc4_ow8(int32x4_t c[8], const int32_t* bias_ptr) {
 }
 
 template <BiasMode bias_mode>
-inline void init_oc8_ow8(int32x4_t c[2][8], const int32_t* bias_ptr,
-                         int oc_step) {
+__ai void init_oc8_ow8(int32x4_t c[2][8], const int32_t* bias_ptr,
+                       int oc_step) {
     if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
 #define BAIS_INIT(step)               \
     c[0][step] = vld1q_s32(bias_ptr); \
@@ -539,28 +541,28 @@ inline void init_oc8_ow8(int32x4_t c[2][8], const int32_t* bias_ptr,
 
 /////////////////////////init_ocx_ow8////////////////////
 
-inline float32x4_t neon_vdupq_n(float val) {
+__ai float32x4_t neon_vdupq_n(float val) {
     return vdupq_n_f32(val);
 }
 
-inline int32x4_t neon_vdupq_n(int val) {
+__ai int32x4_t neon_vdupq_n(int val) {
     return vdupq_n_s32(val);
 }
-inline float32x4_t neon_vld1q(const float* ptr) {
+__ai float32x4_t neon_vld1q(const float* ptr) {
     return vld1q_f32(ptr);
 }
 
-inline int32x4_t neon_vld1q(const int* ptr) {
+__ai int32x4_t neon_vld1q(const int* ptr) {
     return vld1q_s32(ptr);
 }
 
 template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2>
 struct InitOcxOw8 {
-    static void impl(T& c, const T2* bias_ptr, int oc_step);
+    static __ai void impl(T& c, const T2* bias_ptr, int oc_step);
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::NO_BIAS, 8, T, T2> {
-    static void impl(T& c, const T2*, int) {
+    static __ai void impl(T& c, const T2*, int) {
 #define BAIS_INIT(step)                            \
     c[0][step] = neon_vdupq_n(static_cast<T2>(0)); \
     c[1][step] = neon_vdupq_n(static_cast<T2>(0));
@@ -570,7 +572,7 @@ struct InitOcxOw8<2, BiasMode::NO_BIAS, 8, T, T2> {
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::NO_BIAS, 4, T, T2> {
-    static void impl(T& c, const T2*, int) {
+    static __ai void impl(T& c, const T2*, int) {
 #define BAIS_INIT(step)                            \
     c[0][step] = neon_vdupq_n(static_cast<T2>(0)); \
     c[1][step] = neon_vdupq_n(static_cast<T2>(0));
@@ -580,7 +582,7 @@ struct InitOcxOw8<2, BiasMode::NO_BIAS, 4, T, T2> {
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
-    static void impl(T& c, const T2* bias_ptr, int oc_step) {
+    static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
 #define BAIS_INIT(step)                \
     c[0][step] = neon_vld1q(bias_ptr); \
     c[1][step] = neon_vld1q(bias_ptr + oc_step);
@@ -590,7 +592,7 @@ struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
-    static void impl(T& c, const T2* bias_ptr, int oc_step) {
+    static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
 #define BAIS_INIT(step)                \
     c[0][step] = neon_vld1q(bias_ptr); \
     c[1][step] = neon_vld1q(bias_ptr + oc_step);
@@ -600,7 +602,7 @@ struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::BIAS, 8, T, T2> {
-    static void impl(T& c, const T2* bias_ptr, int oc_step) {
+    static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
         constexpr int simd_len = 4;
 #define BAIS_INIT(step)                                  \
     c[0][step] = neon_vld1q(bias_ptr + step * simd_len); \
@@ -611,7 +613,7 @@ struct InitOcxOw8<2, BiasMode::BIAS, 8, T, T2> {
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::BIAS, 4, T, T2> {
-    static void impl(T& c, const T2* bias_ptr, int oc_step) {
+    static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
         constexpr int simd_len = 4;
 #define BAIS_INIT(step)                                  \
     c[0][step] = neon_vld1q(bias_ptr + step * simd_len); \
@@ -623,7 +625,7 @@ struct InitOcxOw8<2, BiasMode::BIAS, 4, T, T2> {
 
 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::NO_BIAS, 8, T, T2> {
-    static void impl(T& c, const T2*, int) {
+    static __ai void impl(T& c, const T2*, int) {
 #define BAIS_INIT(step) c[0][step] = neon_vdupq_n(static_cast<T2>(0));
         UNROLL_CALL_RAW(8, BAIS_INIT);
 #undef BAIS_INIT
@@ -631,7 +633,7 @@ struct InitOcxOw8<1, BiasMode::NO_BIAS, 8, T, T2> {
 };
 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::NO_BIAS, 4, T, T2> {
-    static void impl(T& c, const T2*, int) {
+    static __ai void impl(T& c, const T2*, int) {
 #define BAIS_INIT(step) c[0][step] = neon_vdupq_n(static_cast<T2>(0));
         UNROLL_CALL_RAW(4, BAIS_INIT);
 #undef BAIS_INIT
@@ -639,7 +641,7 @@ struct InitOcxOw8<1, BiasMode::NO_BIAS, 4, T, T2> {
 };
 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
-    static void impl(T& c, const T2* bias_ptr, int) {
+    static __ai void impl(T& c, const T2* bias_ptr, int) {
 #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr);
         UNROLL_CALL_RAW(8, BAIS_INIT);
 #undef BAIS_INIT
@@ -647,7 +649,7 @@ struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
 };
 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
-    static void impl(T& c, const T2* bias_ptr, int) {
+    static __ai void impl(T& c, const T2* bias_ptr, int) {
 #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr);
         UNROLL_CALL_RAW(4, BAIS_INIT);
 #undef BAIS_INIT
@@ -655,7 +657,7 @@ struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
 };
 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::BIAS, 8, T, T2> {
-    static void impl(T& c, const T2* bias_ptr, int) {
+    static __ai void impl(T& c, const T2* bias_ptr, int) {
         constexpr int simd_len = 4;
 #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr + step * simd_len);
         UNROLL_CALL_RAW(8, BAIS_INIT);
@@ -664,7 +666,7 @@ struct InitOcxOw8<1, BiasMode::BIAS, 8, T, T2> {
 };
 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::BIAS, 4, T, T2> {
-    static void impl(T& c, const T2* bias_ptr, int) {
+    static __ai void impl(T& c, const T2* bias_ptr, int) {
         constexpr int simd_len = 4;
 #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr + step * simd_len);
         UNROLL_CALL_RAW(4, BAIS_INIT);
@@ -673,18 +675,18 @@ struct InitOcxOw8<1, BiasMode::BIAS, 4, T, T2> {
 };
 
 template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2>
-inline void init_ocx_ow8(T& c, const T2* bias_ptr, int oc_step) {
+__ai void init_ocx_ow8(T& c, const T2* bias_ptr, int oc_step) {
     InitOcxOw8<c_dim, bias_mode, ow_block, T, T2>::impl(c, bias_ptr, oc_step);
 }
 /////////////////////init_ocx_ow4/////////////////////
 template <int c_dim, BiasMode bias_mode, typename T>
 struct InitOcxOw4 {
-    static void impl(T& c, const int32_t* bias_ptr, int oc_step);
+    static __ai void impl(T& c, const int32_t* bias_ptr, int oc_step);
 };
 
 template <BiasMode bias_mode, typename T>
 struct InitOcxOw4<2, bias_mode, T> {
-    static void impl(T& c, const int32_t* bias_ptr, int oc_step) {
+    static __ai void impl(T& c, const int32_t* bias_ptr, int oc_step) {
         if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
 #define BAIS_INIT(step)               \
     c[0][step] = vld1q_s32(bias_ptr); \
@@ -703,7 +705,7 @@ struct InitOcxOw4<2, bias_mode, T> {
 
 template <BiasMode bias_mode, typename T>
 struct InitOcxOw4<1, bias_mode, T> {
-    static void impl(T& c, const int32_t* bias_ptr, int oc_step) {
+    static __ai void impl(T& c, const int32_t* bias_ptr, int oc_step) {
         MEGDNN_MARK_USED_VAR(oc_step);
         if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
 #define BAIS_INIT(step) c[0][step] = vld1q_s32(bias_ptr);
@@ -718,12 +720,12 @@ struct InitOcxOw4<1, bias_mode, T> {
 };
 
 template <int c_dim, BiasMode bias_mode, typename T>
-inline void init_ocx_ow4(T& c, const int32_t* bias_ptr, int oc_step) {
+__ai void init_ocx_ow4(T& c, const int32_t* bias_ptr, int oc_step) {
     InitOcxOw4<c_dim, bias_mode, T>::impl(c, bias_ptr, oc_step);
 }
 ///////////////////////////////////////
 
 }  // namespace
 }  // namespace megdnn
-
+#undef __ai
 // vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/intrinsic_helper.h b/dnn/src/arm_common/intrinsic_helper.h
index 145430c22cad8a25c84eef057cb77b201cf4c73f..65ed5aebb358d890fb71438f270a2bf90ad0c69b 100644
--- a/dnn/src/arm_common/intrinsic_helper.h
+++ b/dnn/src/arm_common/intrinsic_helper.h
@@ -13,13 +13,14 @@
 #include "src/arm_common/neon_struct.h"
 #include "src/arm_common/simd_macro/marm_neon.h"
 #include "src/common/unroll_macro.h"
+#define __ai inline __attribute__((__always_inline__))
 namespace megdnn {
 namespace {
 
 template <int weight_number, int base_offset, int ptr_step, int oc_block,
           typename Func, typename T, typename T2, typename... XT>
 struct LoadHelper {
-    static void impl(T& weight, T2 ptr, int oc_offset, XT... args);
+    static __ai void impl(T& weight, T2 ptr, int oc_offset, XT... args);
 };
 
 #define WEIGHT_CB(step) \
@@ -29,7 +30,7 @@ struct LoadHelper {
     template <int base_offset, int ptr_step, typename Func, typename T,     \
               typename T2, typename... XT>                                  \
     struct LoadHelper<step, base_offset, ptr_step, 0, Func, T, T2, XT...> { \
-        static void impl(T& src, T2 ptr, int, XT... args) {                 \
+        static __ai void impl(T& src, T2 ptr, int, XT... args) {            \
             UNROLL_CALL_RAW(step, WEIGHT_CB);                               \
         }                                                                   \
     }
@@ -62,7 +63,7 @@ LOAD_HELPER(16);
     template <int base_offset, int ptr_step, typename Func, typename T, \
               typename T2>                                              \
     struct LoadHelper<step, base_offset, ptr_step, 1, Func, T, T2> {    \
-        static void impl(T& src, T2 ptr, int) {                         \
+        static __ai void impl(T& src, T2 ptr, int) {                    \
             UNROLL_CALL_RAW(step, WEIGHT_CB);                           \
         }                                                               \
     }
@@ -89,7 +90,7 @@ LOAD_HELPER(9);
     template <int base_offset, int ptr_step, typename Func, typename T, \
               typename T2>                                              \
     struct LoadHelper<step, base_offset, ptr_step, 2, Func, T, T2> {    \
-        static void impl(T& src, T2 ptr, int oc_offset) {               \
+        static __ai void impl(T& src, T2 ptr, int oc_offset) {          \
             UNROLL_CALL_RAW(step, WEIGHT_CB);                           \
         }                                                               \
     }
@@ -108,19 +109,19 @@ LOAD_HELPER(8);
 
 template <int weight_number, int base_offset, int ptr_step, int c_dim,
           typename Func, typename T, typename T2>
-inline void load_helper(T& weight, T2 ptr, int oc_offset) {
+__ai void load_helper(T& weight, T2 ptr, int oc_offset) {
     LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2>::impl(
             weight, ptr, oc_offset);
 }
 
 template <int weight_number, int base_offset, int ptr_step, int c_dim,
           typename Func, typename T, typename T2, typename... XT>
-inline void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) {
+__ai void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) {
     LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2,
                XT...>::impl(weight, ptr, oc_offset, args...);
 }
 
 }  // namespace
 }  // namespace megdnn
-
+#undef __ai
 // vim: syntax=cpp.doxygen
\ No newline at end of file
diff --git a/dnn/src/arm_common/neon_struct.h b/dnn/src/arm_common/neon_struct.h
index 6aaf140999d30bcb46dca003548b8b460e9a78cf..7edc81347ea11d31b731059baeb1a9156439e82d 100644
--- a/dnn/src/arm_common/neon_struct.h
+++ b/dnn/src/arm_common/neon_struct.h
@@ -11,59 +11,68 @@
  */
 #pragma once
 #include "src/arm_common/simd_macro/marm_neon.h"
+
+#define __ai inline __attribute__((__always_inline__))
 namespace megdnn {
 namespace {
 struct Vdotq_s32_h {
-    static int32x4_t impl(int8x16_t& a, int8x16_t& b, int32x4_t& c,
-                          int16x8_t& temp) {
+    static __ai int32x4_t impl(int8x16_t& a, int8x16_t& b, int32x4_t& c,
+                               int16x8_t& temp) {
         return vdotq_s32_h(a, b, c, temp);
     }
 };
 struct Vdot2_s32_h {
-    static int32x4_t impl(int8x8_t a, int8x8_t b, int32x4_t c, int16x8_t temp) {
+    static __ai int32x4_t impl(int8x8_t a, int8x8_t b, int32x4_t c,
+                               int16x8_t temp) {
         return vdot2_s32_h(a, b, c, temp);
     }
 };
 
 struct Vmlal_s16 {
-    static int32x4_t impl(int16x8_t a, int16x8_t b, int32x4_t c) {
+    static __ai int32x4_t impl(int16x8_t a, int16x8_t b, int32x4_t c) {
         return vmlal_s16(c, vget_low_s16(a), vget_low_s16(b));
     }
 };
 
 struct Vld1q_s8 {
-    static int8x16_t impl(const int8_t* ptr) { return vld1q_s8(ptr); }
+    static __ai int8x16_t impl(const int8_t* ptr) { return vld1q_s8(ptr); }
 };
 struct Vld1q_f32 {
-    static float32x4_t impl(const float32_t* ptr) { return vld1q_f32(ptr); }
+    static __ai float32x4_t impl(const float32_t* ptr) {
+        return vld1q_f32(ptr);
+    }
 };
 struct Vld1_s8 {
-    static int8x8_t impl(const int8_t* ptr) { return vld1_s8(ptr); }
+    static __ai int8x8_t impl(const int8_t* ptr) { return vld1_s8(ptr); }
 };
 struct Vldq_dup_4s8_8s16 {
-    static int16x8_t impl(const int8_t* ptr) { return vldq_dup_4s8_8s16(ptr); }
+    static __ai int16x8_t impl(const int8_t* ptr) {
+        return vldq_dup_4s8_8s16(ptr);
+    }
 };
 
 struct Vldq_tbl_low_s8 {
-    static int8x8_t impl(const int8_t* ptr, uint8x16_t idx) {
+    static __ai int8x8_t impl(const int8_t* ptr, uint8x16_t idx) {
         return vldq_tbl_low_s8(ptr, idx);
     }
 };
 
 struct Vld1_dup_s8_s16 {
-    static int16x8_t impl(const int8_t* ptr) { return vld1_dup_s8_s16(ptr); }
+    static __ai int16x8_t impl(const int8_t* ptr) {
+        return vld1_dup_s8_s16(ptr);
+    }
 };
 
 struct Vfmaq_laneq_f32 {
     template <const int lane>
-    static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
+    static __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
         return vfmaq_laneq_f32(a, b, v, lane);
     }
 };
 #if __ARM_FEATURE_DOTPROD
 struct Vdotq_laneq_s32 {
     template <const int lane>
-    static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
+    static __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
         return vdotq_laneq_s32(a, b, v, lane);
     }
 };
@@ -72,4 +81,5 @@ struct Vdotq_laneq_s32 {
 }  // namespace
 }  // namespace megdnn
 
+#undef __ai
 // vim: syntax=cpp.doxygen
\ No newline at end of file
diff --git a/dnn/src/arm_common/simd_macro/marm_neon.h b/dnn/src/arm_common/simd_macro/marm_neon.h
index 0019b178f2aeeb288fc02d175d1feae2bd512264..8a6cde61e4abc35d4e17d75c6341ef973cf7754c 100644
--- a/dnn/src/arm_common/simd_macro/marm_neon.h
+++ b/dnn/src/arm_common/simd_macro/marm_neon.h
@@ -20,7 +20,9 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wpragmas"
 #pragma GCC diagnostic ignored "-Wattributes"
-#define __ai static inline __attribute__((__always_inline__, __nodebug__))
+#define __ai      \
+    static inline \
+            __attribute__((__gnu_inline__, __always_inline__, __nodebug__))
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !MEGDNN_DISABLE_FLOAT16
 #define MEGDNN_INC_ARM_FP16(_x) _x
@@ -299,16 +301,20 @@ __ai uint32x2_t vdot2_u8(uint8x8_t a, uint8x8_t b) {
 
 #endif  // __ARM_FEATURE_DOTPROD
 
+#if __GNUC__ < 8
 #undef vld1q_f32_x2
 __ai float32x4x2_t vld1q_f32_x2(const float* p) {
     return {{vld1q_f32(p), vld1q_f32(p + 4)}};
 }
+#endif
 
+#if __GNUC__ < 9
 #undef vst1q_f32_x2
 __ai void vst1q_f32_x2(const float* p, float32x4x2_t v) {
     vst1q_f32(const_cast<float*>(p), v.val[0]);
     vst1q_f32(const_cast<float*>(p) + 4, v.val[1]);
 }
+#endif
 
 __ai int8x16_t vtranslq_s8(int8x8_t a) {
     int8x16_t ret;
@@ -472,18 +478,18 @@ __ai int8x16_t vqtbl1q_s8(int8x16_t& a, uint8x16_t& idx) {
 namespace {
 template <int lane>
 struct Vdup_laneq_s16_armv7 {
-    static int16x4_t impl(int16x8_t vec);
+    __ai int16x4_t impl(int16x8_t vec);
 };
 #define cb(step)                                            \
     template <>                                             \
     struct Vdup_laneq_s16_armv7<step + 4> {                 \
-        static int16x4_t impl(int16x8_t vec) {              \
+        __ai int16x4_t impl(int16x8_t vec) {                \
             return vdup_lane_s16(vget_high_s16(vec), step); \
         }                                                   \
     };                                                      \
     template <>                                             \
     struct Vdup_laneq_s16_armv7<step> {                     \
-        static int16x4_t impl(int16x8_t vec) {              \
+        __ai int16x4_t impl(int16x8_t vec) {                \
             return vdup_lane_s16(vget_low_s16(vec), step);  \
         }                                                   \
     };
@@ -495,30 +501,30 @@ UNROLL_CALL_RAW(4, cb);
 namespace {
 template <int lane>
 struct Vfmaq_laneq_f32_armv7 {
-    static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v);
+    __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v);
 };
 
 template <>
 struct Vfmaq_laneq_f32_armv7<0> {
-    static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
+    __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
         return vmlaq_lane_f32(a, b, vget_low_f32(v), 0);
     }
 };
 template <>
 struct Vfmaq_laneq_f32_armv7<1> {
-    static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
+    __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
         return vmlaq_lane_f32(a, b, vget_low_f32(v), 1);
     }
 };
 template <>
 struct Vfmaq_laneq_f32_armv7<2> {
-    static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
+    __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
         return vmlaq_lane_f32(a, b, vget_high_f32(v), 0);
     }
 };
 template <>
 struct Vfmaq_laneq_f32_armv7<3> {
-    static float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
+    __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
         return vmlaq_lane_f32(a, b, vget_high_f32(v), 1);
     }
 };
@@ -527,37 +533,98 @@ struct Vfmaq_laneq_f32_armv7<3> {
     Vfmaq_laneq_f32_armv7<lane>::impl(a, b, v)
 
 #if __ARM_FEATURE_DOTPROD
+namespace {
 template <int lane>
 struct Vdotq_laneq_s32_armv7 {
-    static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v);
+    __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v);
 };
 template <>
 struct Vdotq_laneq_s32_armv7<0> {
-    static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
+    __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
         return vdotq_lane_s32(a, b, vget_low_s32(v), 0);
     }
 };
 template <>
 struct Vdotq_laneq_s32_armv7<1> {
-    static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
+    __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
         return vdotq_lane_s32(a, b, vget_low_s32(v), 1);
     }
 };
 template <>
 struct Vdotq_laneq_s32_armv7<2> {
-    static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
+    __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
         return vdotq_lane_s32(a, b, vget_high_s32(v), 0);
     }
 };
 template <>
 struct Vdotq_laneq_s32_armv7<3> {
-    static int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
+    __ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
         return vdotq_lane_s32(a, b, vget_high_f32(v), 1);
     }
 };
 #define vdotq_laneq_s32(a, b, v, lane) \
     Vdotq_laneq_s32_armv7<lane>::impl(a, b, v)
 
+}  // namespace
+#endif
+
+#endif
+
+//! GCC split fmla with lane to dup+fmla when version < 9
+//! https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+#if !defined(__clang__) && __GNUC__ < 9
+#if MEGDNN_AARCH64
+namespace {
+
+template <int lane>
+struct Vfmaq_laneq_f32_armv8 {
+    __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v);
+};
+template <>
+struct Vfmaq_laneq_f32_armv8<0> {
+    __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
+        asm volatile("fmla %0.4s, %1.4s, %2.s[0]\n"
+                     : "+w"(a)
+                     : "w"(b), "w"(v)
+                     :);
+        return a;
+    }
+};
+template <>
+struct Vfmaq_laneq_f32_armv8<1> {
+    __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
+        asm volatile("fmla %0.4s, %1.4s, %2.s[1]\n"
+                     : "+w"(a)
+                     : "w"(b), "w"(v)
+                     :);
+        return a;
+    }
+};
+template <>
+struct Vfmaq_laneq_f32_armv8<2> {
+    __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
+        asm volatile("fmla %0.4s, %1.4s, %2.s[2]\n"
+                     : "+w"(a)
+                     : "w"(b), "w"(v)
+                     :);
+        return a;
+    }
+};
+template <>
+struct Vfmaq_laneq_f32_armv8<3> {
+    __ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
+        asm volatile("fmla %0.4s, %1.4s, %2.s[3]\n"
+                     : "+w"(a)
+                     : "w"(b), "w"(v)
+                     :);
+        return a;
+    }
+};
+}  // namespace
+#undef vfmaq_laneq_f32
+#define vfmaq_laneq_f32(a, b, v, lane) \
+    Vfmaq_laneq_f32_armv8<lane>::impl(a, b, v)
+
 #endif
 
 #endif
diff --git a/dnn/test/arm_common/conv_bias_multi_thread.cpp b/dnn/test/arm_common/conv_bias_multi_thread.cpp
index ef19dea7e034ac1226c18ab9f86927321358474e..d9f7bd72d92e2407999afd4973965e655c9bb7ef 100644
--- a/dnn/test/arm_common/conv_bias_multi_thread.cpp
+++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp
@@ -77,7 +77,7 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
         bool only_no_bias = false) {
     using namespace conv_bias;
     using NLMode = param::ConvBias::NonlineMode;
-    
+
     std::vector<TestArg> args;
 
     auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w,
@@ -172,11 +172,11 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
         bias_mode.emplace_back(megdnn::BiasMode::NO_BIAS);
     }
     if (support_full_bias) {
-      bias_mode.emplace_back(megdnn::BiasMode::BIAS);
+        bias_mode.emplace_back(megdnn::BiasMode::BIAS);
     }
     for (auto bias : bias_mode)
         for (auto nlmode : nonlinemode)
-            for (size_t n : {1,2})
+            for (size_t n : {1, 2})
                 for (size_t kernel : kernel_vec)
                     for (size_t oc : {4, 12})
                         for (size_t ic : {1, 3, 4, 12})
@@ -364,8 +364,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_SMALL_GROUP) {
 }
 
 TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_NCHW44_S1_K7) {
-    check_conv_bias(get_nchw44_conv_bias_args({7}, 1, false, true, true,
-                                              false, false, false),
+    check_conv_bias(get_nchw44_conv_bias_args({7}, 1, false, true, true, false,
+                                              false, false),
                     handle(), "F32_CONV_NCHW44_DIRECT");
 }
 
@@ -403,10 +403,12 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_STR2_SMALL_GROUP) {
     check_conv_bias(get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false),
                     handle(), "F32STRD2_SMALL_GROUP");
 }
-TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_NCHW_NCHW44_F32) {
+TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_NCHW_NCHW44_F32_S2) {
     check_conv_bias(get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false,
                                               false, true),
                     handle(), "F32_CONV_NCHW_NCHW44");
+}
+TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_NCHW_NCHW44_F32_S1) {
     check_conv_bias(get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false,
                                               false, true),
                     handle(), "F32_CONV_NCHW_NCHW44");
@@ -566,13 +568,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_QS8_CHANNEL_WISE_DIRECT2_NCHW44) {
             handle(), "S8_CHAN_WISE_STRD2_NCHW44");
 }
 
-TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_NCHW_NCHW44) {
+TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_NCHW_NCHW44_S1) {
     checker_conv_bias_qint8x8x8(
-            get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false, false,
+            get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, false,
                                       true),
             handle(), "S8_CONV_NCHW_NCHW44");
+}
+TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_NCHW_NCHW44_S2) {
     checker_conv_bias_qint8x8x8(
-            get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, false,
+            get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false, false,
                                       true),
             handle(), "S8_CONV_NCHW_NCHW44");
 }
@@ -1820,7 +1824,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) {
 TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32) {
     using namespace conv_bias;
     std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args(
-            {2, 4, 7}, 1, false, false, false, false, false, true,true);
+            {2, 4, 7}, 1, false, false, false, false, false, true, true);
 #if MEGDNN_AARCH64
     check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1");
 #elif MEGDNN_ARMV7
@@ -1841,7 +1845,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32) {
 TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32_FUSE) {
     using namespace conv_bias;
     std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args(
-            {3}, 2, false, false, false, false, false, true, true,false);
+            {3}, 2, false, false, false, false, false, true, true, false);
 #if MEGDNN_AARCH64
     check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1");
 #elif MEGDNN_ARMV7
diff --git a/toolchains/aarch64-none-linux-gnu.toolchain.cmake b/toolchains/aarch64-none-linux-gnu.toolchain.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..df9c1ab4c2b21353104a1c6d6b64bc10eaf6108c
--- /dev/null
+++ b/toolchains/aarch64-none-linux-gnu.toolchain.cmake
@@ -0,0 +1,6 @@
+set(ARM_CROSS_BUILD_ARCH aarch64)
+set(CMAKE_C_COMPILER "aarch64-none-linux-gnu-gcc")
+set(CMAKE_CXX_COMPILER "aarch64-none-linux-gnu-g++")
+set(CMAKE_C_FLAGS "-Werror=unused-parameter -Wno-psabi")
+set(CMAKE_CXX_FLAGS "-Werror=unused-parameter -Wno-psabi")
+set(CMAKE_STRIP "aarch64-none-linux-gnu-strip")