[arm] improve conv3x3_dw (#4063)

* optimize conv_dw profiler * fix build conv_dw_3x3s1 bug * update conv_dw_3x3s2 * fix format test=develop

[arm] improve conv3x3_dw (#4063)
* optimize conv_dw profiler * fix build conv_dw_3x3s1 bug * update conv_dw_3x3s2 * fix format test=develop
9e38adc8 · HappyAngel · GitHub · ee4cb1dc · 9e38adc8 · 9e38adc8
5 changed files
--- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
@@ -20,10 +20,11 @@ namespace lite {
 namespace arm {
 namespace math {

-void conv_depthwise_3x3s1p0_bias(float *dout,
+void conv_depthwise_3x3s1p1_bias_relu6(float *dout,
                                       const float *din,
                                       const float *weights,
                                       const float *bias,
+                                       const float *six,
                                       bool flag_bias,
                                       const int num,
                                       const int ch_in,
@@ -31,13 +32,13 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
                                       const int w_in,
                                       const int h_out,
                                       const int w_out,
-                                 const operators::ActivationParam act_param,
                                       ARMContext *ctx);

-void conv_depthwise_3x3s1p0_bias_s(float *dout,
+void conv_depthwise_3x3s1p1_bias_s_relu6(float *dout,
                                         const float *din,
                                         const float *weights,
                                         const float *bias,
+                                         const float *six,
                                         bool flag_bias,
                                         const int num,
                                         const int ch_in,
@@ -45,13 +46,13 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
                                         const int w_in,
                                         const int h_out,
                                         const int w_out,
-                                   const operators::ActivationParam act_param,
                                         ARMContext *ctx);

-void conv_depthwise_3x3s1p1_bias(float *dout,
+void conv_depthwise_3x3s1p0_bias_relu6(float *dout,
                                       const float *din,
                                       const float *weights,
                                       const float *bias,
+                                       const float *six,
                                       bool flag_bias,
                                       const int num,
                                       const int ch_in,
@@ -59,13 +60,69 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
                                       const int w_in,
                                       const int h_out,
                                       const int w_out,
-                                 const operators::ActivationParam act_param,
                                       ARMContext *ctx);

-void conv_depthwise_3x3s1p1_bias_s(float *dout,
+void conv_depthwise_3x3s1p0_bias_s_relu6(float *dout,
                                         const float *din,
                                         const float *weights,
                                         const float *bias,
+                                         const float *six,
+                                         bool flag_bias,
+                                         const int num,
+                                         const int ch_in,
+                                         const int h_in,
+                                         const int w_in,
+                                         const int h_out,
+                                         const int w_out,
+                                         ARMContext *ctx);
+
+void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout,
+                                           const float *din,
+                                           const float *weights,
+                                           const float *bias,
+                                           const float *scale,
+                                           bool flag_bias,
+                                           const int num,
+                                           const int ch_in,
+                                           const int h_in,
+                                           const int w_in,
+                                           const int h_out,
+                                           const int w_out,
+                                           ARMContext *ctx);
+
+void conv_depthwise_3x3s1p1_bias_s_leakyRelu(float *dout,
+                                             const float *din,
+                                             const float *weights,
+                                             const float *bias,
+                                             const float *scale,
+                                             bool flag_bias,
+                                             const int num,
+                                             const int ch_in,
+                                             const int h_in,
+                                             const int w_in,
+                                             const int h_out,
+                                             const int w_out,
+                                             ARMContext *ctx);
+
+void conv_depthwise_3x3s1p0_bias_leakyRelu(float *dout,
+                                           const float *din,
+                                           const float *weights,
+                                           const float *bias,
+                                           const float *scale,
+                                           bool flag_bias,
+                                           const int num,
+                                           const int ch_in,
+                                           const int h_in,
+                                           const int w_in,
+                                           const int h_out,
+                                           const int w_out,
+                                           ARMContext *ctx);
+
+void conv_depthwise_3x3s1p0_bias_s_leakyRelu(float *dout,
+                                             const float *din,
+                                             const float *weights,
+                                             const float *bias,
+                                             const float *scale,
                                             bool flag_bias,
                                             const int num,
                                             const int ch_in,
@@ -73,7 +130,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
                                             const int w_in,
                                             const int h_out,
                                             const int w_out,
-                                   const operators::ActivationParam act_param,
                                             ARMContext *ctx);

 void conv_depthwise_3x3s1_fp32(const float *din,
@@ -92,22 +148,85 @@ void conv_depthwise_3x3s1_fp32(const float *din,
                               const operators::ActivationParam act_param,
                               ARMContext *ctx) {
  bool has_active = act_param.has_active;
-  bool flag_relu = false;
-  bool relu6 = false;
+  auto act_type = act_param.active_type;
+  float tmp = act_param.Relu_clipped_coef;
+  float ss = act_param.Leaky_relu_alpha;
+  float vsix[4] = {tmp, tmp, tmp, tmp};
+  float vscale[4] = {ss, ss, ss, ss};
  if (has_active) {
-    if (act_param.active_type == lite_api::ActivationType::kRelu) {
-      flag_relu = true;
+    switch (act_type) {
+      case lite_api::ActivationType::kRelu:
+        if (pad == 0) {
+          if (w_in > 5) {
+            conv_depthwise_3x3s1p0_bias_relu(dout,
+                                             din,
+                                             weights,
+                                             bias,
+                                             flag_bias,
+                                             true,
+                                             num,
+                                             ch_in,
+                                             h_in,
+                                             w_in,
+                                             h_out,
+                                             w_out,
+                                             ctx);
+          } else {
+            conv_depthwise_3x3s1p0_bias_s_relu(dout,
+                                               din,
+                                               weights,
+                                               bias,
+                                               flag_bias,
+                                               true,
+                                               num,
+                                               ch_in,
+                                               h_in,
+                                               w_in,
+                                               h_out,
+                                               w_out,
+                                               ctx);
+          }
+        }
+        if (pad == 1) {
+          if (w_in > 4) {
+            conv_depthwise_3x3s1p1_bias_relu(dout,
+                                             din,
+                                             weights,
+                                             bias,
+                                             flag_bias,
+                                             true,
+                                             num,
+                                             ch_in,
+                                             h_in,
+                                             w_in,
+                                             h_out,
+                                             w_out,
+                                             ctx);
          } else {
-      relu6 = true;
+            conv_depthwise_3x3s1p1_bias_s_relu(dout,
+                                               din,
+                                               weights,
+                                               bias,
+                                               flag_bias,
+                                               true,
+                                               num,
+                                               ch_in,
+                                               h_in,
+                                               w_in,
+                                               h_out,
+                                               w_out,
+                                               ctx);
          }
        }
+        break;
+      case lite_api::ActivationType::kRelu6:
        if (pad == 0) {
          if (w_in > 5) {
-      if (relu6) {
-        conv_depthwise_3x3s1p0_bias(dout,
+            conv_depthwise_3x3s1p0_bias_relu6(dout,
                                              din,
                                              weights,
                                              bias,
+                                              vsix,
                                              flag_bias,
                                              num,
                                              ch_in,
@@ -115,15 +234,14 @@ void conv_depthwise_3x3s1_fp32(const float *din,
                                              w_in,
                                              h_out,
                                              w_out,
-                                    act_param,
                                              ctx);
          } else {
-        conv_depthwise_3x3s1p0_bias_relu(dout,
+            conv_depthwise_3x3s1p0_bias_s_relu6(dout,
                                                din,
                                                weights,
                                                bias,
+                                                vsix,
                                                flag_bias,
-                                         flag_relu,
                                                num,
                                                ch_in,
                                                h_in,
@@ -132,12 +250,14 @@ void conv_depthwise_3x3s1_fp32(const float *din,
                                                w_out,
                                                ctx);
          }
-    } else {
-      if (relu6) {
-        conv_depthwise_3x3s1p0_bias_s(dout,
+        }
+        if (pad == 1) {
+          if (w_in > 4) {
+            conv_depthwise_3x3s1p1_bias_relu6(dout,
                                              din,
                                              weights,
                                              bias,
+                                              vsix,
                                              flag_bias,
                                              num,
                                              ch_in,
@@ -145,15 +265,14 @@ void conv_depthwise_3x3s1_fp32(const float *din,
                                              w_in,
                                              h_out,
                                              w_out,
-                                      act_param,
                                              ctx);
          } else {
-        conv_depthwise_3x3s1p0_bias_s_relu(dout,
+            conv_depthwise_3x3s1p1_bias_s_relu6(dout,
                                                din,
                                                weights,
                                                bias,
+                                                vsix,
                                                flag_bias,
-                                           flag_relu,
                                                num,
                                                ch_in,
                                                h_in,
@@ -163,14 +282,46 @@ void conv_depthwise_3x3s1_fp32(const float *din,
                                                ctx);
          }
        }
+        break;
+      case lite_api::ActivationType::kLeakyRelu:
+        if (pad == 0) {
+          if (w_in > 5) {
+            conv_depthwise_3x3s1p0_bias_leakyRelu(dout,
+                                                  din,
+                                                  weights,
+                                                  bias,
+                                                  vscale,
+                                                  flag_bias,
+                                                  num,
+                                                  ch_in,
+                                                  h_in,
+                                                  w_in,
+                                                  h_out,
+                                                  w_out,
+                                                  ctx);
+          } else {
+            conv_depthwise_3x3s1p0_bias_s_leakyRelu(dout,
+                                                    din,
+                                                    weights,
+                                                    bias,
+                                                    vscale,
+                                                    flag_bias,
+                                                    num,
+                                                    ch_in,
+                                                    h_in,
+                                                    w_in,
+                                                    h_out,
+                                                    w_out,
+                                                    ctx);
+          }
        }
        if (pad == 1) {
          if (w_in > 4) {
-      if (relu6) {
-        conv_depthwise_3x3s1p1_bias(dout,
+            conv_depthwise_3x3s1p1_bias_leakyRelu(dout,
                                                  din,
                                                  weights,
                                                  bias,
+                                                  vscale,
                                                  flag_bias,
                                                  num,
                                                  ch_in,
@@ -178,15 +329,14 @@ void conv_depthwise_3x3s1_fp32(const float *din,
                                                  w_in,
                                                  h_out,
                                                  w_out,
-                                    act_param,
                                                  ctx);
          } else {
-        conv_depthwise_3x3s1p1_bias_relu(dout,
+            conv_depthwise_3x3s1p1_bias_s_leakyRelu(dout,
                                                    din,
                                                    weights,
                                                    bias,
+                                                    vscale,
                                                    flag_bias,
-                                         flag_relu,
                                                    num,
                                                    ch_in,
                                                    h_in,
@@ -195,28 +345,66 @@ void conv_depthwise_3x3s1_fp32(const float *din,
                                                    w_out,
                                                    ctx);
          }
+        }
+        break;
+      default:
+        LOG(FATAL) << "this act_type: " << static_cast<int>(act_type)
+                   << " fuse not support";
+    }
  } else {
-      if (relu6) {
-        conv_depthwise_3x3s1p1_bias_s(dout,
+    if (pad == 0) {
+      if (w_in > 5) {
+        conv_depthwise_3x3s1p0_bias_no_relu(dout,
                                            din,
                                            weights,
                                            bias,
                                            flag_bias,
+                                            false,
                                            num,
                                            ch_in,
                                            h_in,
                                            w_in,
                                            h_out,
                                            w_out,
-                                      act_param,
                                            ctx);
      } else {
-        conv_depthwise_3x3s1p1_bias_s_relu(dout,
+        conv_depthwise_3x3s1p0_bias_s_no_relu(dout,
+                                              din,
+                                              weights,
+                                              bias,
+                                              flag_bias,
+                                              false,
+                                              num,
+                                              ch_in,
+                                              h_in,
+                                              w_in,
+                                              h_out,
+                                              w_out,
+                                              ctx);
+      }
+    }
+    if (pad == 1) {
+      if (w_in > 4) {
+        conv_depthwise_3x3s1p1_bias_no_relu(dout,
+                                            din,
+                                            weights,
+                                            bias,
+                                            flag_bias,
+                                            false,
+                                            num,
+                                            ch_in,
+                                            h_in,
+                                            w_in,
+                                            h_out,
+                                            w_out,
+                                            ctx);
+      } else {
+        conv_depthwise_3x3s1p1_bias_s_no_relu(dout,
                                              din,
                                              weights,
                                              bias,
                                              flag_bias,
-                                           flag_relu,
+                                              false,
                                              num,
                                              ch_in,
                                              h_in,
@@ -1978,82 +2166,169 @@ void conv_depthwise_3x3s1_fp32(const float *din,

 #endif

+void conv_depthwise_3x3s1p1_bias_relu6(float *dout,
+                                       const float *din,
+                                       const float *weights,
+                                       const float *bias,
+                                       const float *six,
+                                       bool flag_bias,
+                                       const int num,
+                                       const int ch_in,
+                                       const int h_in,
+                                       const int w_in,
+                                       const int h_out,
+                                       const int w_out,
+                                       ARMContext *ctx) {
+  //! pad is done implicit
+  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+  //! for 4x6 convolution window
+  const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
+
+  float *zero_ptr = ctx->workspace_data<float>();
+  memset(zero_ptr, 0, w_in * sizeof(float));
+  float *write_ptr = zero_ptr + w_in;
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+  int w_stride = 9;
+
+  int tile_w = w_out >> 2;
+  int remain = w_out % 4;
+  int cnt_col = tile_w - 1;
+
+  unsigned int size_pad_right = (unsigned int)(5 + (tile_w << 2) - w_in);
+  const unsigned int remian_idx[4] = {0, 1, 2, 3};
+
+  if (remain == 0 && size_pad_right == 5) {
+    size_pad_right = 1;
+    cnt_col -= 1;
+    remain = 4;
+  } else if (remain == 0 && size_pad_right == 6) {
+    size_pad_right = 2;
+    cnt_col -= 1;
+    remain = 4;
+  }
+
+  uint32x4_t vmask_rp1 =
+      vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_rp2 =
+      vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_result =
+      vcgtq_u32(vdupq_n_u32(remain), vld1q_u32(remian_idx));
+
+  unsigned int vmask[8];
+  vst1q_u32(vmask, vmask_rp1);
+  vst1q_u32(vmask + 4, vmask_rp2);
+
+  unsigned int rmask[4];
+  vst1q_u32(rmask, vmask_result);
+
+  float32x4_t vzero = vdupq_n_f32(0.f);
 #ifdef __aarch64__
-void act_switch_3x3s1p1(const float *din_ptr0,
-                        const float *din_ptr1,
-                        const float *din_ptr2,
-                        const float *din_ptr3,
-                        const float *din_ptr4,
-                        const float *din_ptr5,
-                        float *doutr0,
-                        float *doutr1,
-                        float *doutr2,
-                        float *doutr3,
-                        float32x4_t wr0,
-                        float32x4_t wr1,
-                        float32x4_t wr2,
-                        unsigned int *vmask,
-                        unsigned int *rmask,
-                        float32x4_t vzero,
-                        float *vbias,
-                        int cnt,
-                        const operators::ActivationParam act_param) {
-  float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef);
-  float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha);
-
-  switch (act_param.active_type) {
-    case lite_api::ActivationType::kRelu:
-      asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1
-                       MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU
-                   : [cnt] "+r"(cnt),
-                     [din_ptr0] "+r"(din_ptr0),
-                     [din_ptr1] "+r"(din_ptr1),
-                     [din_ptr2] "+r"(din_ptr2),
-                     [din_ptr3] "+r"(din_ptr3),
-                     [din_ptr4] "+r"(din_ptr4),
-                     [din_ptr5] "+r"(din_ptr5),
-                     [doutr0] "+r"(doutr0),
-                     [doutr1] "+r"(doutr1),
-                     [doutr2] "+r"(doutr2),
-                     [doutr3] "+r"(doutr3)
-                   : [w0] "w"(wr0),
-                     [w1] "w"(wr1),
-                     [w2] "w"(wr2),
-                     [bias_val] "r"(vbias),
-                     [vmask] "r"(vmask),
-                     [rmask] "r"(rmask),
-                     [vzero] "w"(vzero)
-                   : "cc",
-                     "memory",
-                     "v0",
-                     "v1",
-                     "v2",
-                     "v3",
-                     "v4",
-                     "v5",
-                     "v6",
-                     "v7",
-                     "v8",
-                     "v9",
-                     "v10",
-                     "v11",
-                     "v12",
-                     "v13",
-                     "v14",
-                     "v15",
-                     "v16",
-                     "v17",
-                     "v18",
-                     "v19",
-                     "v20",
-                     "v21",
-                     "v22",
-                     "v23",
-                     "v24",
-                     "v25");
+  float32x4_t vsix = vld1q_f32(six);
+#endif
+  for (int n = 0; n < num; ++n) {
+    const float *din_batch = din + n * ch_in * size_in_channel;
+    float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int c = 0; c < ch_in; c++) {
+      float *dout_ptr = dout_batch + c * size_out_channel;
+
+      const float *din_ch_ptr = din_batch + c * size_in_channel;
+
+      float bias_val = flag_bias ? bias[c] : 0.f;
+      float vbias[4] = {bias_val, bias_val, bias_val, bias_val};
+
+      const float *wei_ptr = weights + c * w_stride;
+
+      float32x4_t wr0 = vld1q_f32(wei_ptr);
+      float32x4_t wr1 = vld1q_f32(wei_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(wei_ptr + 6);
+
+      float *doutr0 = dout_ptr;
+      float *doutr1 = doutr0 + w_out;
+      float *doutr2 = doutr1 + w_out;
+      float *doutr3 = doutr2 + w_out;
+
+      const float *dr0 = din_ch_ptr;
+      const float *dr1 = dr0 + w_in;
+      const float *dr2 = dr1 + w_in;
+      const float *dr3 = dr2 + w_in;
+      const float *dr4 = dr3 + w_in;
+      const float *dr5 = dr4 + w_in;
+
+      const float *din_ptr0 = dr0;
+      const float *din_ptr1 = dr1;
+      const float *din_ptr2 = dr2;
+      const float *din_ptr3 = dr3;
+      const float *din_ptr4 = dr4;
+      const float *din_ptr5 = dr5;
+      float *ptr_zero = const_cast<float *>(zero);
+#ifdef __aarch64__
+      for (int i = 0; i < h_out; i += 4) {
+        //! process top pad pad_h = 1
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;
+        din_ptr4 = dr4;
+        din_ptr5 = dr5;
+
+        doutr0 = dout_ptr;
+        doutr1 = doutr0 + w_out;
+        doutr2 = doutr1 + w_out;
+        doutr3 = doutr2 + w_out;
+        if (i == 0) {
+          din_ptr0 = zero_ptr;
+          din_ptr1 = dr0;
+          din_ptr2 = dr1;
+          din_ptr3 = dr2;
+          din_ptr4 = dr3;
+          din_ptr5 = dr4;
+          dr0 = dr3;
+          dr1 = dr4;
+          dr2 = dr5;
+        } else {
+          dr0 = dr4;
+          dr1 = dr5;
+          dr2 = dr1 + w_in;
+        }
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+        dr5 = dr4 + w_in;
+
+        //! process bottom pad
+        if (i + 5 > h_in) {
+          switch (i + 5 - h_in) {
+            case 5:
+              din_ptr1 = zero_ptr;
+            case 4:
+              din_ptr2 = zero_ptr;
+            case 3:
+              din_ptr3 = zero_ptr;
+            case 2:
+              din_ptr4 = zero_ptr;
+            case 1:
+              din_ptr5 = zero_ptr;
+            default:
              break;
-    case lite_api::ActivationType::kRelu6:
-      /* 0 <= din <= 6 */
+          }
+        }
+        //! process bottom remain
+        if (i + 4 > h_out) {
+          switch (i + 4 - h_out) {
+            case 3:
+              doutr1 = write_ptr;
+            case 2:
+              doutr2 = write_ptr;
+            case 1:
+              doutr3 = write_ptr;
+            default:
+              break;
+          }
+        }
+
+        int cnt = cnt_col;
        asm volatile(
            INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1
                MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6
@@ -2104,90 +2379,57 @@ void act_switch_3x3s1p1(const float *din_ptr0,
              "v23",
              "v24",
              "v25");
-      break;
-    case lite_api::ActivationType::kLeakyRelu:
-      /*din = din >= 0 ? din : din * scale*/
-      asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
-                       MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1
-                           RIGHT_RESULT_S1_LEAKY_RELU
-                   : [cnt] "+r"(cnt),
-                     [din_ptr0] "+r"(din_ptr0),
-                     [din_ptr1] "+r"(din_ptr1),
-                     [din_ptr2] "+r"(din_ptr2),
-                     [din_ptr3] "+r"(din_ptr3),
-                     [din_ptr4] "+r"(din_ptr4),
-                     [din_ptr5] "+r"(din_ptr5),
-                     [doutr0] "+r"(doutr0),
-                     [doutr1] "+r"(doutr1),
-                     [doutr2] "+r"(doutr2),
-                     [doutr3] "+r"(doutr3)
-                   : [w0] "w"(wr0),
-                     [w1] "w"(wr1),
-                     [w2] "w"(wr2),
-                     [vscale] "w"(vscale),
-                     [bias_val] "r"(vbias),
-                     [vmask] "r"(vmask),
-                     [rmask] "r"(rmask),
-                     [vzero] "w"(vzero)
-                   : "cc",
-                     "memory",
-                     "v0",
-                     "v1",
-                     "v2",
-                     "v3",
-                     "v4",
-                     "v5",
-                     "v6",
-                     "v7",
-                     "v8",
-                     "v9",
-                     "v10",
-                     "v11",
-                     "v12",
-                     "v13",
-                     "v14",
-                     "v15",
-                     "v16",
-                     "v17",
-                     "v18",
-                     "v19",
-                     "v20",
-                     "v21",
-                     "v22",
-                     "v23",
-                     "v24",
-                     "v25");
-      break;
-    default:
-      LOG(FATAL) << "this act_type: " << static_cast<int>(act_param.active_type)
-                 << " fuse not support";
+        dout_ptr = dout_ptr + 4 * w_out;
      }
-}
 #else
-void act_switch_3x3s1p1(const float *din_ptr0,
-                        const float *din_ptr1,
-                        const float *din_ptr2,
-                        const float *din_ptr3,
-                        float *doutr0,
-                        float *doutr1,
-                        float32x4_t wr0,
-                        float32x4_t wr1,
-                        float32x4_t wr2,
-                        unsigned int *vmask_ptr,
-                        unsigned int *rmask_ptr,
-                        float32x4_t vzero,
-                        float bias_val,
-                        int cnt,
-                        const operators::ActivationParam act_param) {
-  float tmp = act_param.Relu_clipped_coef;
-  float ss = act_param.Leaky_relu_alpha;
-  float vsix[4] = {tmp, tmp, tmp, tmp};
-  float vscale[4] = {ss, ss, ss, ss};
+      for (int i = 0; i < h_out; i += 2) {
+        //! process top pad pad_h = 1
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;

-  switch (act_param.active_type) {
-    case lite_api::ActivationType::kRelu:
-      asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1
-                       MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU
+        doutr0 = dout_ptr;
+        doutr1 = dout_ptr + w_out;
+
+        if (i == 0) {
+          din_ptr0 = zero_ptr;
+          din_ptr1 = dr0;
+          din_ptr2 = dr1;
+          din_ptr3 = dr2;
+          dr0 = dr1;
+          dr1 = dr2;
+          dr2 = dr3;
+          dr3 = dr2 + w_in;
+        } else {
+          dr0 = dr2;
+          dr1 = dr3;
+          dr2 = dr1 + w_in;
+          dr3 = dr2 + w_in;
+        }
+        //! process bottom pad
+        if (i + 3 > h_in) {
+          switch (i + 3 - h_in) {
+            case 3:
+              din_ptr1 = zero_ptr;
+            case 2:
+              din_ptr2 = zero_ptr;
+            case 1:
+              din_ptr3 = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! process bottom remain
+        if (i + 2 > h_out) {
+          doutr1 = write_ptr;
+        }
+        int cnt = cnt_col;
+        unsigned int *rmask_ptr = rmask;
+        unsigned int *vmask_ptr = vmask;
+        asm volatile(
+            INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1
+                MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6
            : [dout_ptr1] "+r"(doutr0),
              [dout_ptr2] "+r"(doutr1),
              [din0_ptr] "+r"(din_ptr0),
@@ -2201,6 +2443,7 @@ void act_switch_3x3s1p1(const float *din_ptr0,
              [wr1] "w"(wr1),
              [wr2] "w"(wr2),
              [bias_val] "r"(bias_val),
+              [six_ptr] "r"(six),
              [vzero] "w"(vzero)
            : "cc",
              "memory",
@@ -2216,91 +2459,18 @@ void act_switch_3x3s1p1(const float *din_ptr0,
              "q13",
              "q14",
              "q15");
-      break;
-    case lite_api::ActivationType::kRelu6:
-      /* 0 <= din <= 6 */
-      asm volatile(
-          INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1
-              MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6
-          : [dout_ptr1] "+r"(doutr0),
-            [dout_ptr2] "+r"(doutr1),
-            [din0_ptr] "+r"(din_ptr0),
-            [din1_ptr] "+r"(din_ptr1),
-            [din2_ptr] "+r"(din_ptr2),
-            [din3_ptr] "+r"(din_ptr3),
-            [cnt] "+r"(cnt),
-            [rmask] "+r"(rmask_ptr),
-            [vmask] "+r"(vmask_ptr)
-          : [wr0] "w"(wr0),
-            [wr1] "w"(wr1),
-            [wr2] "w"(wr2),
-            [bias_val] "r"(bias_val),
-            [six_ptr] "r"(vsix),
-            [vzero] "w"(vzero)
-          : "cc",
-            "memory",
-            "q4",
-            "q5",
-            "q6",
-            "q7",
-            "q8",
-            "q9",
-            "q10",
-            "q11",
-            "q12",
-            "q13",
-            "q14",
-            "q15");
-      break;
-    case lite_api::ActivationType::kLeakyRelu:
-      /*din = din >= 0 ? din : din * scale*/
-      asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
-                       MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1
-                           RIGHT_RESULT_S1_LEAKY_RELU
-                   : [dout_ptr1] "+r"(doutr0),
-                     [dout_ptr2] "+r"(doutr1),
-                     [din0_ptr] "+r"(din_ptr0),
-                     [din1_ptr] "+r"(din_ptr1),
-                     [din2_ptr] "+r"(din_ptr2),
-                     [din3_ptr] "+r"(din_ptr3),
-                     [cnt] "+r"(cnt),
-                     [rmask] "+r"(rmask_ptr),
-                     [vmask] "+r"(vmask_ptr)
-                   : [wr0] "w"(wr0),
-                     [wr1] "w"(wr1),
-                     [wr2] "w"(wr2),
-                     [bias_val] "r"(bias_val),
-                     [scale_ptr] "r"(vscale),
-                     [vzero] "w"(vzero)
-                   : "cc",
-                     "memory",
-                     "q4",
-                     "q5",
-                     "q6",
-                     "q7",
-                     "q8",
-                     "q9",
-                     "q10",
-                     "q11",
-                     "q12",
-                     "q13",
-                     "q14",
-                     "q15");
-      break;
-    default:
-      LOG(FATAL) << "this act_type: " << static_cast<int>(act_param.active_type)
-                 << " fuse not support";
+        dout_ptr += 2 * w_out;
+      }  //! end of processing mid rows
+#endif
+    }
  }
 }
-#endif
-/**
- * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
- * width > 4
- */
-void conv_depthwise_3x3s1p1_bias(float *dout,
+
+void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout,
                                           const float *din,
                                           const float *weights,
                                           const float *bias,
+                                           const float *scale,
                                           bool flag_bias,
                                           const int num,
                                           const int ch_in,
@@ -2308,7 +2478,6 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
                                           const int w_in,
                                           const int h_out,
                                           const int w_out,
-                                 const operators::ActivationParam act_param,
                                           ARMContext *ctx) {
  //! pad is done implicit
  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
@@ -2355,7 +2524,9 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
  vst1q_u32(rmask, vmask_result);

  float32x4_t vzero = vdupq_n_f32(0.f);
-
+#ifdef __aarch64__
+  float32x4_t vscale = vld1q_f32(scale);
+#endif
  for (int n = 0; n < num; ++n) {
    const float *din_batch = din + n * ch_in * size_in_channel;
    float *dout_batch = dout + n * ch_in * size_out_channel;
@@ -2458,25 +2629,56 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
        }

        int cnt = cnt_col;
-        act_switch_3x3s1p1(din_ptr0,
-                           din_ptr1,
-                           din_ptr2,
-                           din_ptr3,
-                           din_ptr4,
-                           din_ptr5,
-                           doutr0,
-                           doutr1,
-                           doutr2,
-                           doutr3,
-                           wr0,
-                           wr1,
-                           wr2,
-                           vmask,
-                           rmask,
-                           vzero,
-                           vbias,
-                           cnt,
-                           act_param);
+        asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
+                         MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
+                             RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU
+                     : [cnt] "+r"(cnt),
+                       [din_ptr0] "+r"(din_ptr0),
+                       [din_ptr1] "+r"(din_ptr1),
+                       [din_ptr2] "+r"(din_ptr2),
+                       [din_ptr3] "+r"(din_ptr3),
+                       [din_ptr4] "+r"(din_ptr4),
+                       [din_ptr5] "+r"(din_ptr5),
+                       [doutr0] "+r"(doutr0),
+                       [doutr1] "+r"(doutr1),
+                       [doutr2] "+r"(doutr2),
+                       [doutr3] "+r"(doutr3)
+                     : [w0] "w"(wr0),
+                       [w1] "w"(wr1),
+                       [w2] "w"(wr2),
+                       [vscale] "w"(vscale),
+                       [bias_val] "r"(vbias),
+                       [vmask] "r"(vmask),
+                       [rmask] "r"(rmask),
+                       [vzero] "w"(vzero)
+                     : "cc",
+                       "memory",
+                       "v0",
+                       "v1",
+                       "v2",
+                       "v3",
+                       "v4",
+                       "v5",
+                       "v6",
+                       "v7",
+                       "v8",
+                       "v9",
+                       "v10",
+                       "v11",
+                       "v12",
+                       "v13",
+                       "v14",
+                       "v15",
+                       "v16",
+                       "v17",
+                       "v18",
+                       "v19",
+                       "v20",
+                       "v21",
+                       "v22",
+                       "v23",
+                       "v24",
+                       "v25");
        dout_ptr = dout_ptr + 4 * w_out;
      }
 #else
@@ -2525,100 +2727,28 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
        int cnt = cnt_col;
        unsigned int *rmask_ptr = rmask;
        unsigned int *vmask_ptr = vmask;
-        act_switch_3x3s1p1(din_ptr0,
-                           din_ptr1,
-                           din_ptr2,
-                           din_ptr3,
-                           doutr0,
-                           doutr1,
-                           wr0,
-                           wr1,
-                           wr2,
-                           vmask_ptr,
-                           rmask_ptr,
-                           vzero,
-                           bias_val,
-                           cnt,
-                           act_param);
-        dout_ptr += 2 * w_out;
-      }  //! end of processing mid rows
-#endif
-    }
-  }
-}
-void act_switch_3x3s1p1_s(const float *din_ptr0,
-                          const float *din_ptr1,
-                          const float *din_ptr2,
-                          const float *din_ptr3,
-                          float *doutr0,
-                          float *doutr1,
-                          float32x4_t wr0,
-                          float32x4_t wr1,
-                          float32x4_t wr2,
-                          uint32x4_t vmask_rp,
-                          float32x4_t vzero,
-                          float32x4_t wbias,
-                          const operators::ActivationParam act_param) {
-#ifdef __aarch64__
-  float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef);
-  float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha);
-#else
-  float tmp = act_param.Relu_clipped_coef;
-  float ss = act_param.Leaky_relu_alpha;
-  float vsix[4] = {tmp, tmp, tmp, tmp};
-  float vscale[4] = {ss, ss, ss, ss};
-#endif
-  switch (act_param.active_type) {
-    case lite_api::ActivationType::kRelu:
-#ifdef __aarch64__
-      asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU
-                   : [din0] "+r"(din_ptr0),
-                     [din1] "+r"(din_ptr1),
-                     [din2] "+r"(din_ptr2),
-                     [din3] "+r"(din_ptr3)
-                   : [wr0] "w"(wr0),
-                     [wr1] "w"(wr1),
-                     [wr2] "w"(wr2),
-                     [vzero] "w"(vzero),
-                     [mask] "w"(vmask_rp),
-                     [bias] "w"(wbias),
-                     [out1] "r"(doutr0),
-                     [out2] "r"(doutr1)
-                   : "v0",
-                     "v1",
-                     "v2",
-                     "v3",
-                     "v4",
-                     "v5",
-                     "v6",
-                     "v7",
-                     "v8",
-                     "v9",
-                     "v10",
-                     "v11",
-                     "v12",
-                     "v13",
-                     "v14",
-                     "v15",
-                     "v16",
-                     "v17");
-      break;
-#else
-      asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU
-                   : [din0] "+r"(din_ptr0),
-                     [din1] "+r"(din_ptr1),
-                     [din2] "+r"(din_ptr2),
-                     [din3] "+r"(din_ptr3)
+        asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
+                         MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
+                             RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU
+                     : [dout_ptr1] "+r"(doutr0),
+                       [dout_ptr2] "+r"(doutr1),
+                       [din0_ptr] "+r"(din_ptr0),
+                       [din1_ptr] "+r"(din_ptr1),
+                       [din2_ptr] "+r"(din_ptr2),
+                       [din3_ptr] "+r"(din_ptr3),
+                       [cnt] "+r"(cnt),
+                       [rmask] "+r"(rmask_ptr),
+                       [vmask] "+r"(vmask_ptr)
                     : [wr0] "w"(wr0),
                       [wr1] "w"(wr1),
                       [wr2] "w"(wr2),
-                     [vzero] "w"(vzero),
-                     [mask] "w"(vmask_rp),
-                     [bias] "w"(wbias),
-                     [out1] "r"(doutr0),
-                     [out2] "r"(doutr1)
+                       [bias_val] "r"(bias_val),
+                       [scale_ptr] "r"(scale),
+                       [vzero] "w"(vzero)
                     : "cc",
                       "memory",
+                       "q4",
+                       "q5",
                       "q6",
                       "q7",
                       "q8",
@@ -2629,77 +2759,104 @@ void act_switch_3x3s1p1_s(const float *din_ptr0,
                       "q13",
                       "q14",
                       "q15");
-      break;
+        dout_ptr += 2 * w_out;
+      }  //! end of processing mid rows
 #endif
-    case lite_api::ActivationType::kRelu6:
-/* 0 <= din <= 6 */
+    }
+  }
+}
+
+void conv_depthwise_3x3s1p1_bias_s_relu6(float *dout,
+                                         const float *din,
+                                         const float *weights,
+                                         const float *bias,
+                                         const float *six,
+                                         bool flag_bias,
+                                         const int num,
+                                         const int ch_in,
+                                         const int h_in,
+                                         const int w_in,
+                                         const int h_out,
+                                         const int w_out,
+                                         ARMContext *ctx) {
+  const int right_pad_idx[4] = {3, 2, 1, 0};
+  const float zero[4] = {0.f, 0.f, 0.f, 0.f};
+
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  uint32x4_t vmask_rp =
+      vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in));
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
 #ifdef __aarch64__
-      asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6
-                   : [din0] "+r"(din_ptr0),
-                     [din1] "+r"(din_ptr1),
-                     [din2] "+r"(din_ptr2),
-                     [din3] "+r"(din_ptr3)
-                   : [wr0] "w"(wr0),
-                     [wr1] "w"(wr1),
-                     [wr2] "w"(wr2),
-                     [vzero] "w"(vzero),
-                     [mask] "w"(vmask_rp),
-                     [bias] "w"(wbias),
-                     [vsix] "w"(vsix),
-                     [out1] "r"(doutr0),
-                     [out2] "r"(doutr1)
-                   : "v0",
-                     "v1",
-                     "v2",
-                     "v3",
-                     "v4",
-                     "v5",
-                     "v6",
-                     "v7",
-                     "v8",
-                     "v9",
-                     "v10",
-                     "v11",
-                     "v12",
-                     "v13",
-                     "v14",
-                     "v15",
-                     "v16",
-                     "v17");
-      break;
-#else
-      asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6
-                   : [din0] "+r"(din_ptr0),
-                     [din1] "+r"(din_ptr1),
-                     [din2] "+r"(din_ptr2),
-                     [din3] "+r"(din_ptr3)
-                   : [wr0] "w"(wr0),
-                     [wr1] "w"(wr1),
-                     [wr2] "w"(wr2),
-                     [vzero] "w"(vzero),
-                     [mask] "w"(vmask_rp),
-                     [bias] "w"(wbias),
-                     [six_ptr] "r"(vsix),
-                     [out1] "r"(doutr0),
-                     [out2] "r"(doutr1)
-                   : "cc",
-                     "memory",
-                     "q6",
-                     "q7",
-                     "q8",
-                     "q9",
-                     "q10",
-                     "q11",
-                     "q12",
-                     "q13",
-                     "q14",
-                     "q15");
-      break;
+  float32x4_t vsix = vld1q_f32(six);
 #endif
-    case lite_api::ActivationType::kLeakyRelu:
-/*din = din >= 0 ? din : din * scale*/
+  for (int n = 0; n < num; ++n) {
+    const float *din_batch = din + n * ch_in * size_in_channel;
+    float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      float *dout_channel = dout_batch + i * size_out_channel;
+      const float *din_channel = din_batch + i * size_in_channel;
+      const float *weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+      float32x4_t wbias;
+      if (flag_bias) {
+        wbias = vdupq_n_f32(bias[i]);
+      } else {
+        wbias = vdupq_n_f32(0.f);
+      }
+
+      float out_buf1[4];
+      float out_buf2[4];
+      float trash_buf[4];
+
+      float *doutr0 = dout_channel;
+      float *doutr1 = dout_channel + w_out;
+
+      const float *dr0 = din_channel;
+      const float *dr1 = dr0 + w_in;
+      const float *dr2 = dr1 + w_in;
+      const float *dr3 = dr2 + w_in;
+
+      for (int j = 0; j < h_out; j += 2) {
+        const float *din_ptr0 = dr0;
+        const float *din_ptr1 = dr1;
+        const float *din_ptr2 = dr2;
+        const float *din_ptr3 = dr3;
+        if (j == 0) {
+          din_ptr0 = zero;
+          din_ptr1 = dr0;
+          din_ptr2 = dr1;
+          din_ptr3 = dr2;
+          dr0 = dr1;
+          dr1 = dr2;
+        } else {
+          dr0 = dr2;
+          dr1 = dr3;
+        }
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        //! process bottom pad
+        if (j + 3 > h_in) {
+          switch (j + 3 - h_in) {
+            case 3:
+              din_ptr1 = zero;
+            case 2:
+              din_ptr2 = zero;
+            case 1:
+              din_ptr3 = zero;
+            default:
+              break;
+          }
+        }
+        //! process bottom remain
+        if (j + 2 > h_out) {
+          doutr1 = trash_buf;
+        }
 #ifdef __aarch64__
-      asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU
+        asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6
                     : [din0] "+r"(din_ptr0),
                       [din1] "+r"(din_ptr1),
                       [din2] "+r"(din_ptr2),
@@ -2710,7 +2867,7 @@ void act_switch_3x3s1p1_s(const float *din_ptr0,
                       [vzero] "w"(vzero),
                       [mask] "w"(vmask_rp),
                       [bias] "w"(wbias),
-                     [vscale] "w"(vscale),
+                       [vsix] "w"(vsix),
                       [out1] "r"(doutr0),
                       [out2] "r"(doutr1)
                     : "v0",
@@ -2730,13 +2887,9 @@ void act_switch_3x3s1p1_s(const float *din_ptr0,
                       "v14",
                       "v15",
                       "v16",
-                     "v17",
-                     "v18",
-                     "v19",
-                     "v20");
-      break;
+                       "v17");
 #else
-      asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU
+        asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6
                     : [din0] "+r"(din_ptr0),
                       [din1] "+r"(din_ptr1),
                       [din2] "+r"(din_ptr2),
@@ -2747,7 +2900,7 @@ void act_switch_3x3s1p1_s(const float *din_ptr0,
                       [vzero] "w"(vzero),
                       [mask] "w"(vmask_rp),
                       [bias] "w"(wbias),
-                     [scale_ptr] "r"(vscale),
+                       [six_ptr] "r"(six),
                       [out1] "r"(doutr0),
                       [out2] "r"(doutr1)
                     : "cc",
@@ -2762,21 +2915,23 @@ void act_switch_3x3s1p1_s(const float *din_ptr0,
                       "q13",
                       "q14",
                       "q15");
-      break;
 #endif
-    default:
-      LOG(FATAL) << "this act_type: " << static_cast<int>(act_param.active_type)
-                 << " fuse not support";
+        for (int w = 0; w < w_out; ++w) {
+          *doutr0++ = out_buf1[w];
+          *doutr1++ = out_buf2[w];
        }
+        doutr0 = doutr1;
+        doutr1 += w_out;
+      }  // end of processing heights
+    }    // end of processing channels
+  }      // end of processing batches
 }
-/**
- * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
- * width <= 4
- */
-void conv_depthwise_3x3s1p1_bias_s(float *dout,
+
+void conv_depthwise_3x3s1p1_bias_s_leakyRelu(float *dout,
                                             const float *din,
                                             const float *weights,
                                             const float *bias,
+                                             const float *scale,
                                             bool flag_bias,
                                             const int num,
                                             const int ch_in,
@@ -2784,11 +2939,7 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
                                             const int w_in,
                                             const int h_out,
                                             const int w_out,
-                                   const operators::ActivationParam act_param,
                                             ARMContext *ctx) {
-  //! 3x3s1 convolution, implemented by direct algorithm
-  //! pad is done implicit
-  //! for 4x6 convolution window
  const int right_pad_idx[4] = {3, 2, 1, 0};
  const float zero[4] = {0.f, 0.f, 0.f, 0.f};

@@ -2797,6 +2948,9 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
      vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in));
  int size_in_channel = w_in * h_in;
  int size_out_channel = w_out * h_out;
+#ifdef __aarch64__
+  float32x4_t vscale = vld1q_f32(scale);
+#endif
  for (int n = 0; n < num; ++n) {
    const float *din_batch = din + n * ch_in * size_in_channel;
    float *dout_batch = dout + n * ch_in * size_out_channel;
@@ -2862,90 +3016,22 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
        if (j + 2 > h_out) {
          doutr1 = trash_buf;
        }
-        act_switch_3x3s1p1_s(dr0_ptr,
-                             dr1_ptr,
-                             dr2_ptr,
-                             dr3_ptr,
-                             out_buf1,
-                             out_buf2,
-                             wr0,
-                             wr1,
-                             wr2,
-                             vmask_rp,
-                             vzero,
-                             wbias,
-                             act_param);
-        for (int w = 0; w < w_out; ++w) {
-          *doutr0++ = out_buf1[w];
-          *doutr1++ = out_buf2[w];
-        }
-        doutr0 = doutr1;
-        doutr1 += w_out;
-      }  // end of processing heights
-    }    // end of processing channels
-  }      // end of processing batchs
-}
-
 #ifdef __aarch64__
-void act_switch_3x3s1p0(const float *din_ptr0,
-                        const float *din_ptr1,
-                        const float *din_ptr2,
-                        const float *din_ptr3,
-                        const float *din_ptr4,
-                        const float *din_ptr5,
-                        float *doutr0,
-                        float *doutr1,
-                        float *doutr2,
-                        float *doutr3,
-                        float32x4_t wr0,
-                        float32x4_t wr1,
-                        float32x4_t wr2,
-                        unsigned int *vmask,
-                        unsigned int *rmask,
-                        float32x4_t vzero,
-                        float *vbias,
-                        int cnt,
-                        int remain,
-                        const operators::ActivationParam act_param) {
-  float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef);
-  float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha);
-
-  switch (act_param.active_type) {
-    case lite_api::ActivationType::kRelu:
-      asm volatile(
-          INIT_S1
-          "ld1 {v8.4s}, [%[din_ptr4]], #16   \n"  /*vld1q_f32(din_ptr0)*/
-          "ld1 {v10.4s}, [%[din_ptr5]], #16   \n" /*vld1q_f32(din_ptr0)*/
-          "ext  v16.16b, v0.16b, v1.16b, #4 \n"   /* v16 = 1234 */
-          "ext  v17.16b, v0.16b, v1.16b, #8 \n"   /* v17 = 2345 */
-          "ld1 {v9.4s}, [%[din_ptr4]]   \n"       /*vld1q_f32(din_ptr0)*/
-          "ld1 {v11.4s}, [%[din_ptr5]]   \n"      /*vld1q_f32(din_ptr0)*/
-          MID_COMPUTE_S1 MID_RESULT_S1_RELU
-          "cmp  %w[remain], #1             \n"
-          "blt 0f                         \n" RIGHT_COMPUTE_S1
-              RIGHT_RESULT_S1_RELU "0: \n"
-          : [cnt] "+r"(cnt),
-            [din_ptr0] "+r"(din_ptr0),
-            [din_ptr1] "+r"(din_ptr1),
-            [din_ptr2] "+r"(din_ptr2),
-            [din_ptr3] "+r"(din_ptr3),
-            [din_ptr4] "+r"(din_ptr4),
-            [din_ptr5] "+r"(din_ptr5),
-            [doutr0] "+r"(doutr0),
-            [doutr1] "+r"(doutr1),
-            [doutr2] "+r"(doutr2),
-            [doutr3] "+r"(doutr3)
-          : [w0] "w"(wr0),
-            [w1] "w"(wr1),
-            [w2] "w"(wr2),
-            [bias_val] "r"(vbias),
-            [vmask] "r"(vmask),
-            [rmask] "r"(rmask),
+        asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU
+                     : [din0] "+r"(dr0_ptr),
+                       [din1] "+r"(dr1_ptr),
+                       [din2] "+r"(dr2_ptr),
+                       [din3] "+r"(dr3_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
                       [vzero] "w"(vzero),
-            [remain] "r"(remain)
-          : "cc",
-            "memory",
-            "v0",
+                       [mask] "w"(vmask_rp),
+                       [bias] "w"(wbias),
+                       [vscale] "w"(vscale),
+                       [out1] "r"(doutr0),
+                       [out2] "r"(doutr1)
+                     : "v0",
                       "v1",
                       "v2",
                       "v3",
@@ -2962,18 +3048,196 @@ void act_switch_3x3s1p0(const float *din_ptr0,
                       "v14",
                       "v15",
                       "v16",
-            "v17",
-            "v18",
-            "v19",
-            "v20",
-            "v21",
-            "v22",
-            "v23",
-            "v24",
-            "v25");
+                       "v17");
+#else
+        asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU
+                     : [din0] "+r"(dr0_ptr),
+                       [din1] "+r"(dr1_ptr),
+                       [din2] "+r"(dr2_ptr),
+                       [din3] "+r"(dr3_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [vzero] "w"(vzero),
+                       [mask] "w"(vmask_rp),
+                       [bias] "w"(wbias),
+                       [scale_ptr] "r"(scale),
+                       [out1] "r"(doutr0),
+                       [out2] "r"(doutr1)
+                     : "cc",
+                       "memory",
+                       "q6",
+                       "q7",
+                       "q8",
+                       "q9",
+                       "q10",
+                       "q11",
+                       "q12",
+                       "q13",
+                       "q14",
+                       "q15");
+#endif
+        for (int w = 0; w < w_out; ++w) {
+          *doutr0++ = out_buf1[w];
+          *doutr1++ = out_buf2[w];
+        }
+        doutr0 = doutr1;
+        doutr1 += w_out;
+      }  // end of processing heights
+    }    // end of processing channels
+  }      // end of processing batches
+}
+
+void conv_depthwise_3x3s1p0_bias_relu6(float *dout,
+                                       const float *din,
+                                       const float *weights,
+                                       const float *bias,
+                                       const float *six,
+                                       bool flag_bias,
+                                       const int num,
+                                       const int ch_in,
+                                       const int h_in,
+                                       const int w_in,
+                                       const int h_out,
+                                       const int w_out,
+                                       ARMContext *ctx) {
+  //! pad is done implicit
+  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+  //! for 4x6 convolution window
+  const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
+
+  float *zero_ptr = ctx->workspace_data<float>();
+  memset(zero_ptr, 0, w_in * sizeof(float));
+  float *write_ptr = zero_ptr + w_in;
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+  int w_stride = 9;
+
+  int tile_w = w_out >> 2;
+  int remain = w_out % 4;
+
+  unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in);
+  const int remian_idx[4] = {0, 1, 2, 3};
+
+#ifdef __aarch64__
+  float32x4_t vsix = vld1q_f32(six);
+#endif
+
+  if (remain == 0 && size_pad_right == 6) {  // w_in == w_out and w_out % 4 == 0
+    tile_w -= 1;
+    remain = 4;
+    size_pad_right = 2;
+  }
+
+  uint32x4_t vmask_rp1 =
+      vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_rp2 =
+      vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_result =
+      vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx));
+
+  unsigned int vmask[8];
+  vst1q_u32(vmask, vmask_rp1);
+  vst1q_u32(vmask + 4, vmask_rp2);
+
+  unsigned int rmask[4];
+  vst1q_u32(rmask, vmask_result);
+
+  float32x4_t vzero = vdupq_n_f32(0.f);
+
+  for (int n = 0; n < num; ++n) {
+    const float *din_batch = din + n * ch_in * size_in_channel;
+    float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int c = 0; c < ch_in; c++) {
+      float *dout_ptr = dout_batch + c * size_out_channel;
+
+      const float *din_ch_ptr = din_batch + c * size_in_channel;
+
+      float bias_val = flag_bias ? bias[c] : 0.f;
+      float vbias[4] = {bias_val, bias_val, bias_val, bias_val};
+
+      const float *wei_ptr = weights + c * w_stride;
+
+      float32x4_t wr0 = vld1q_f32(wei_ptr);
+      float32x4_t wr1 = vld1q_f32(wei_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(wei_ptr + 6);
+
+      float *doutr0 = dout_ptr;
+      float *doutr1 = doutr0 + w_out;
+      float *doutr2 = doutr1 + w_out;
+      float *doutr3 = doutr2 + w_out;
+
+      const float *dr0 = din_ch_ptr;
+      const float *dr1 = dr0 + w_in;
+      const float *dr2 = dr1 + w_in;
+      const float *dr3 = dr2 + w_in;
+      const float *dr4 = dr3 + w_in;
+      const float *dr5 = dr4 + w_in;
+
+      const float *din_ptr0 = dr0;
+      const float *din_ptr1 = dr1;
+      const float *din_ptr2 = dr2;
+      const float *din_ptr3 = dr3;
+      const float *din_ptr4 = dr4;
+      const float *din_ptr5 = dr5;
+
+      float *ptr_zero = const_cast<float *>(zero);
+#ifdef __aarch64__
+      for (int i = 0; i < h_out; i += 4) {
+        //! process top pad pad_h = 1
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;
+        din_ptr4 = dr4;
+        din_ptr5 = dr5;
+
+        doutr0 = dout_ptr;
+        doutr1 = doutr0 + w_out;
+        doutr2 = doutr1 + w_out;
+        doutr3 = doutr2 + w_out;
+
+        dr0 = dr4;
+        dr1 = dr5;
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+        dr5 = dr4 + w_in;
+
+        //! process bottom pad
+        if (i + 5 >= h_in) {
+          switch (i + 5 - h_in) {
+            case 4:
+              din_ptr1 = zero_ptr;
+            case 3:
+              din_ptr2 = zero_ptr;
+            case 2:
+              din_ptr3 = zero_ptr;
+            case 1:
+              din_ptr4 = zero_ptr;
+            case 0:
+              din_ptr5 = zero_ptr;
+            default:
              break;
-    case lite_api::ActivationType::kRelu6:
-      /* 0 <= din <= 6 */
+          }
+        }
+        //! process bottom remain
+        if (i + 4 > h_out) {
+          switch (i + 4 - h_out) {
+            case 3:
+              doutr1 = write_ptr;
+            case 2:
+              doutr2 = write_ptr;
+            case 1:
+              doutr3 = write_ptr;
+            default:
+              break;
+          }
+        }
+
+        int cnt = tile_w;
        asm volatile(
            INIT_S1
            "ld1 {v8.4s}, [%[din_ptr4]], #16   \n"  /*vld1q_f32(din_ptr0)*/
@@ -3033,98 +3297,42 @@ void act_switch_3x3s1p0(const float *din_ptr0,
              "v23",
              "v24",
              "v25");
-      break;
-    case lite_api::ActivationType::kLeakyRelu:
-      /*din = din >= 0 ? din : din * scale*/
-      asm volatile(
-          INIT_S1
-          "ld1 {v8.4s}, [%[din_ptr4]], #16   \n"  /*vld1q_f32(din_ptr0)*/
-          "ld1 {v10.4s}, [%[din_ptr5]], #16   \n" /*vld1q_f32(din_ptr0)*/
-          "ext  v16.16b, v0.16b, v1.16b, #4 \n"   /* v16 = 1234 */
-          "ext  v17.16b, v0.16b, v1.16b, #8 \n"   /* v17 = 2345 */
-          "ld1 {v9.4s}, [%[din_ptr4]]   \n"       /*vld1q_f32(din_ptr0)*/
-          "ld1 {v11.4s}, [%[din_ptr5]]   \n"      /*vld1q_f32(din_ptr0)*/
-          MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
-          "cmp  %w[remain], #1             \n"
-          "blt 0f                         \n" RIGHT_COMPUTE_S1
-              RIGHT_RESULT_S1_LEAKY_RELU "0: \n"
-          : [cnt] "+r"(cnt),
-            [din_ptr0] "+r"(din_ptr0),
-            [din_ptr1] "+r"(din_ptr1),
-            [din_ptr2] "+r"(din_ptr2),
-            [din_ptr3] "+r"(din_ptr3),
-            [din_ptr4] "+r"(din_ptr4),
-            [din_ptr5] "+r"(din_ptr5),
-            [doutr0] "+r"(doutr0),
-            [doutr1] "+r"(doutr1),
-            [doutr2] "+r"(doutr2),
-            [doutr3] "+r"(doutr3)
-          : [w0] "w"(wr0),
-            [w1] "w"(wr1),
-            [w2] "w"(wr2),
-            [vscale] "w"(vscale),
-            [bias_val] "r"(vbias),
-            [vmask] "r"(vmask),
-            [rmask] "r"(rmask),
-            [remain] "r"(remain)
-          : "cc",
-            "memory",
-            "v0",
-            "v1",
-            "v2",
-            "v3",
-            "v4",
-            "v5",
-            "v6",
-            "v7",
-            "v8",
-            "v9",
-            "v10",
-            "v11",
-            "v12",
-            "v13",
-            "v14",
-            "v15",
-            "v16",
-            "v17",
-            "v18",
-            "v19",
-            "v20",
-            "v21",
-            "v22",
-            "v23",
-            "v24",
-            "v25");
-      break;
-    default:
-      LOG(FATAL) << "this act_type: " << static_cast<int>(act_param.active_type)
-                 << " fuse not support";
+        dout_ptr = dout_ptr + 4 * w_out;
      }
-}
 #else
-void act_switch_3x3s1p0(const float *din_ptr0,
-                        const float *din_ptr1,
-                        const float *din_ptr2,
-                        const float *din_ptr3,
-                        float *doutr0,
-                        float *doutr1,
-                        float32x4_t wr0,
-                        float32x4_t wr1,
-                        float32x4_t wr2,
-                        unsigned int *vmask_ptr,
-                        unsigned int *rmask_ptr,
-                        float32x4_t vzero,
-                        float bias_val,
-                        int cnt,
-                        int remain,
-                        const operators::ActivationParam act_param) {
-  float tmp = act_param.Relu_clipped_coef;
-  float ss = act_param.Leaky_relu_alpha;
-  float vsix[4] = {tmp, tmp, tmp, tmp};
-  float vscale[4] = {ss, ss, ss, ss};
+      for (int i = 0; i < h_out; i += 2) {
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;

-  switch (act_param.active_type) {
-    case lite_api::ActivationType::kRelu:
+        doutr0 = dout_ptr;
+        doutr1 = dout_ptr + w_out;
+
+        dr0 = dr2;
+        dr1 = dr3;
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        //! process bottom pad
+        if (i + 4 > h_in) {
+          switch (i + 4 - h_in) {
+            case 3:
+              din_ptr1 = zero_ptr;
+            case 2:
+              din_ptr2 = zero_ptr;
+            case 1:
+              din_ptr3 = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! process bottom remain
+        if (i + 2 > h_out) {
+          doutr1 = write_ptr;
+        }
+        int cnt = tile_w;
+        unsigned int *rmask_ptr = rmask;
+        unsigned int *vmask_ptr = vmask;
        asm volatile(INIT_S1
                     "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
                     "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
@@ -3132,10 +3340,10 @@ void act_switch_3x3s1p0(const float *din_ptr0,
                     "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
                     "vext.32  q6, q8, q9, #1     @ 0012\n"
                     "vext.32  q7, q8, q9, #2     @ 1234\n" MID_COMPUTE_S1
-                       MID_RESULT_S1_RELU
+                         MID_RESULT_S1_RELU6
                     "cmp  %[remain], #1             \n"
                     "blt 0f                         \n" RIGHT_COMPUTE_S1
-                       RIGHT_RESULT_S1_RELU "0:                         \n"
+                         RIGHT_RESULT_S1_RELU6 "0:                         \n"
                     : [dout_ptr1] "+r"(doutr0),
                       [dout_ptr2] "+r"(doutr1),
                       [din0_ptr] "+r"(din_ptr0),
@@ -3148,6 +3356,7 @@ void act_switch_3x3s1p0(const float *din_ptr0,
                     : [wr0] "w"(wr0),
                       [wr1] "w"(wr1),
                       [wr2] "w"(wr2),
+                       [six_ptr] "r"(six),
                       [bias_val] "r"(bias_val),
                       [vzero] "w"(vzero),
                       [remain] "r"(remain)
@@ -3165,81 +3374,146 @@ void act_switch_3x3s1p0(const float *din_ptr0,
                       "q13",
                       "q14",
                       "q15");
+        dout_ptr += 2 * w_out;
+      }  //! end of processing mid rows
+#endif
+    }
+  }
+}
+
+void conv_depthwise_3x3s1p0_bias_s_relu6(float *dout,
+                                         const float *din,
+                                         const float *weights,
+                                         const float *bias,
+                                         const float *six,
+                                         bool flag_bias,
+                                         const int num,
+                                         const int ch_in,
+                                         const int h_in,
+                                         const int w_in,
+                                         const int h_out,
+                                         const int w_out,
+                                         ARMContext *ctx) {
+  const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
+  const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f};
+
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  uint32x4_t vmask_rp1 =
+      vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in));
+  uint32x4_t vmask_rp2 =
+      vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in));
+
+#ifdef __aarch64__
+  float32x4_t vsix = vld1q_f32(six);
+#endif
+
+  unsigned int vmask[8];
+  vst1q_u32(vmask, vmask_rp1);
+  vst1q_u32(vmask + 4, vmask_rp2);
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+  for (int n = 0; n < num; ++n) {
+    const float *din_batch = din + n * ch_in * size_in_channel;
+    float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      float *dout_channel = dout_batch + i * size_out_channel;
+      const float *din_channel = din_batch + i * size_in_channel;
+      const float *weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+
+      float32x4_t wbias;
+      float bias_val = 0.f;
+      if (flag_bias) {
+        wbias = vdupq_n_f32(bias[i]);
+        bias_val = bias[i];
+      } else {
+        wbias = vdupq_n_f32(0.f);
+      }
+      float out_buf1[4];  // asm result staging for output row j
+      float out_buf2[4];  // asm result staging for output row j + 1
+      float trash_buf[4];  // sink for row j + 1 when it is past h_out
+
+      float *doutr0 = dout_channel;
+      float *doutr1 = dout_channel + w_out;
+
+      for (int j = 0; j < h_out; j += 2) {
+        const float *dr0 = din_channel + j * w_in;
+        const float *dr1 = dr0 + w_in;
+        const float *dr2 = dr1 + w_in;
+        const float *dr3 = dr2 + w_in;
+
+        doutr0 = dout_channel + j * w_out;
+        doutr1 = doutr0 + w_out;
+
+        if (j + 4 > h_in) {
+          switch (j + 4 - h_in) {
+            case 3:
+              dr1 = zero_ptr;
+            case 2:
+              dr2 = zero_ptr;
+            case 1:
+              dr3 = zero_ptr;
+            default:
              break;
-    case lite_api::ActivationType::kRelu6:
-      /* 0 <= din <= 6 */
-      asm volatile(INIT_S1
-                   "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
-                   "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
-                   "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
-                   "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
-                   "vext.32  q6, q8, q9, #1     @ 0012\n"
-                   "vext.32  q7, q8, q9, #2     @ 1234\n" MID_COMPUTE_S1
-                       MID_RESULT_S1_RELU6
-                   "cmp  %[remain], #1             \n"
-                   "blt 0f                         \n" RIGHT_COMPUTE_S1
-                       RIGHT_RESULT_S1_RELU6 "0:                         \n"
-                   : [dout_ptr1] "+r"(doutr0),
-                     [dout_ptr2] "+r"(doutr1),
-                     [din0_ptr] "+r"(din_ptr0),
-                     [din1_ptr] "+r"(din_ptr1),
-                     [din2_ptr] "+r"(din_ptr2),
-                     [din3_ptr] "+r"(din_ptr3),
-                     [cnt] "+r"(cnt),
-                     [rmask] "+r"(rmask_ptr),
+          }
+        }
+        if (j + 2 > h_out) {
+          doutr1 = trash_buf;
+        }
+        unsigned int *vmask_ptr = vmask;
+#ifdef __aarch64__
+        asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6
+                     : [din0] "+r"(dr0),
+                       [din1] "+r"(dr1),
+                       [din2] "+r"(dr2),
+                       [din3] "+r"(dr3)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [vbias] "w"(wbias),
+                       [mask1] "w"(vmask_rp1),
+                       [mask2] "w"(vmask_rp2),
+                       [vzero] "w"(vzero),
+                       [vsix] "w"(vsix),
+                       [out1] "r"(out_buf1),
+                       [out2] "r"(out_buf2)
+                     : "cc",
+                       "memory",
+                       "v0",
+                       "v1",
+                       "v2",
+                       "v3",
+                       "v4",
+                       "v5",
+                       "v6",
+                       "v7",
+                       "v8",
+                       "v9",
+                       "v10",
+                       "v11",
+                       "v12",
+                       "v13",
+                       "v14",
+                       "v15");
+#else
+        asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6
+                     : [din0] "+r"(dr0),
+                       [din1] "+r"(dr1),
+                       [din2] "+r"(dr2),
+                       [din3] "+r"(dr3),
                       [vmask] "+r"(vmask_ptr)
                     : [wr0] "w"(wr0),
                       [wr1] "w"(wr1),
                       [wr2] "w"(wr2),
-                     [six_ptr] "r"(vsix),
-                     [bias_val] "r"(bias_val),
                       [vzero] "w"(vzero),
-                     [remain] "r"(remain)
-                   : "cc",
-                     "memory",
-                     "q4",
-                     "q5",
-                     "q6",
-                     "q7",
-                     "q8",
-                     "q9",
-                     "q10",
-                     "q11",
-                     "q12",
-                     "q13",
-                     "q14",
-                     "q15");
-      break;
-    case lite_api::ActivationType::kLeakyRelu:
-      /*din = din >= 0 ? din : din * scale*/
-      asm volatile(INIT_S1
-                   "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
-                   "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
-                   "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
-                   "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
-                   "vext.32  q6, q8, q9, #1     @ 0012\n"
-                   "vext.32  q7, q8, q9, #2     @ 1234\n" MID_COMPUTE_S1
-                       MID_RESULT_S1_LEAKY_RELU
-                   "cmp  %[remain], #1             \n"
-                   "blt 0f                         \n" RIGHT_COMPUTE_S1
-                       RIGHT_RESULT_S1_LEAKY_RELU
-                   "0:                         \n"
-                   : [dout_ptr1] "+r"(doutr0),
-                     [dout_ptr2] "+r"(doutr1),
-                     [din0_ptr] "+r"(din_ptr0),
-                     [din1_ptr] "+r"(din_ptr1),
-                     [din2_ptr] "+r"(din_ptr2),
-                     [din3_ptr] "+r"(din_ptr3),
-                     [cnt] "+r"(cnt),
-                     [rmask] "+r"(rmask_ptr),
-                     [vmask] "+r"(vmask_ptr)
-                   : [wr0] "w"(wr0),
-                     [wr1] "w"(wr1),
-                     [wr2] "w"(wr2),
-                     [scale_ptr] "r"(vscale),
+                       [six_ptr] "r"(six),
                       [bias_val] "r"(bias_val),
-                     [vzero] "w"(vzero),
-                     [remain] "r"(remain)
+                       [out1] "r"(out_buf1),
+                       [out2] "r"(out_buf2)
                     : "cc",
                       "memory",
                       "q4",
@@ -3254,21 +3528,21 @@ void act_switch_3x3s1p0(const float *din_ptr0,
                       "q13",
                       "q14",
                       "q15");
-      break;
-    default:
-      LOG(FATAL) << "this act_type: " << static_cast<int>(act_param.active_type)
-                 << " fuse not support";
+#endif
+        for (int w = 0; w < w_out; ++w) {  // copy the w_out valid results
+          *doutr0++ = out_buf1[w];
+          *doutr1++ = out_buf2[w];
        }
+      }  // end of processing heights
+    }    // end of processing channels
+  }      // end of processing batches
 }
-#endif
-/**
- * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
- * width > 4
- */
-void conv_depthwise_3x3s1p0_bias(float *dout,
+
+void conv_depthwise_3x3s1p0_bias_leakyRelu(float *dout,
                                           const float *din,
                                           const float *weights,
                                           const float *bias,
+                                           const float *scale,
                                           bool flag_bias,
                                           const int num,
                                           const int ch_in,
@@ -3276,7 +3550,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
                                           const int w_in,
                                           const int h_out,
                                           const int w_out,
-                                 const operators::ActivationParam act_param,
                                           ARMContext *ctx) {
  //! pad is done implicit
  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
@@ -3297,6 +3570,10 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
  unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in);
  const int remian_idx[4] = {0, 1, 2, 3};

+#ifdef __aarch64__
+  float32x4_t vscale = vld1q_f32(scale);
+#endif
+
  if (remain == 0 && size_pad_right == 6) {  // w_in == w_out and w_out % 4 == 0
    tile_w -= 1;
    remain = 4;
@@ -3372,300 +3649,76 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
        doutr2 = doutr1 + w_out;
        doutr3 = doutr2 + w_out;

-        dr0 = dr4;
-        dr1 = dr5;
-        dr2 = dr1 + w_in;
-        dr3 = dr2 + w_in;
-        dr4 = dr3 + w_in;
-        dr5 = dr4 + w_in;
-
-        //! process bottom pad
-        if (i + 5 >= h_in) {
-          switch (i + 5 - h_in) {
-            case 4:
-              din_ptr1 = zero_ptr;
-            case 3:
-              din_ptr2 = zero_ptr;
-            case 2:
-              din_ptr3 = zero_ptr;
-            case 1:
-              din_ptr4 = zero_ptr;
-            case 0:
-              din_ptr5 = zero_ptr;
-            default:
-              break;
-          }
-        }
-        //! process bottom remain
-        if (i + 4 > h_out) {
-          switch (i + 4 - h_out) {
-            case 3:
-              doutr1 = write_ptr;
-            case 2:
-              doutr2 = write_ptr;
-            case 1:
-              doutr3 = write_ptr;
-            default:
-              break;
-          }
-        }
-
-        int cnt = tile_w;
-        act_switch_3x3s1p0(din_ptr0,
-                           din_ptr1,
-                           din_ptr2,
-                           din_ptr3,
-                           din_ptr4,
-                           din_ptr5,
-                           doutr0,
-                           doutr1,
-                           doutr2,
-                           doutr3,
-                           wr0,
-                           wr1,
-                           wr2,
-                           vmask,
-                           rmask,
-                           vzero,
-                           vbias,
-                           cnt,
-                           remain,
-                           act_param);
-        dout_ptr = dout_ptr + 4 * w_out;
-      }
-#else
-      for (int i = 0; i < h_out; i += 2) {
-        din_ptr0 = dr0;
-        din_ptr1 = dr1;
-        din_ptr2 = dr2;
-        din_ptr3 = dr3;
-
-        doutr0 = dout_ptr;
-        doutr1 = dout_ptr + w_out;
-
-        dr0 = dr2;
-        dr1 = dr3;
-        dr2 = dr1 + w_in;
-        dr3 = dr2 + w_in;
-        //! process bottom pad
-        if (i + 4 > h_in) {
-          switch (i + 4 - h_in) {
-            case 3:
-              din_ptr1 = zero_ptr;
-            case 2:
-              din_ptr2 = zero_ptr;
-            case 1:
-              din_ptr3 = zero_ptr;
-            default:
-              break;
-          }
-        }
-        //! process bottom remain
-        if (i + 2 > h_out) {
-          doutr1 = write_ptr;
-        }
-        int cnt = tile_w;
-        unsigned int *rmask_ptr = rmask;
-        unsigned int *vmask_ptr = vmask;
-        act_switch_3x3s1p0(din_ptr0,
-                           din_ptr1,
-                           din_ptr2,
-                           din_ptr3,
-                           doutr0,
-                           doutr1,
-                           wr0,
-                           wr1,
-                           wr2,
-                           vmask_ptr,
-                           rmask_ptr,
-                           vzero,
-                           bias_val,
-                           cnt,
-                           remain,
-                           act_param);
-        dout_ptr += 2 * w_out;
-      }  //! end of processing mid rows
-#endif
-    }
-  }
-}
-void act_switch_3x3s1p0_s(const float *din_ptr0,
-                          const float *din_ptr1,
-                          const float *din_ptr2,
-                          const float *din_ptr3,
-                          float *doutr0,
-                          float *doutr1,
-                          float32x4_t wr0,
-                          float32x4_t wr1,
-                          float32x4_t wr2,
-                          uint32x4_t vmask_rp1,
-                          uint32x4_t vmask_rp2,
-                          float32x4_t vzero,
-                          float32x4_t wbias,
-                          unsigned int *vmask_ptr,
-                          float bias_val,
-                          const operators::ActivationParam act_param) {
-#ifdef __aarch64__
-  float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef);
-  float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha);
-#else
-  float tmp = act_param.Relu_clipped_coef;
-  float ss = act_param.Leaky_relu_alpha;
-  float vsix[4] = {tmp, tmp, tmp, tmp};
-  float vscale[4] = {ss, ss, ss, ss};
-#endif
-  switch (act_param.active_type) {
-    case lite_api::ActivationType::kRelu:
-#ifdef __aarch64__
-      asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU
-                   : [din0] "+r"(din_ptr0),
-                     [din1] "+r"(din_ptr1),
-                     [din2] "+r"(din_ptr2),
-                     [din3] "+r"(din_ptr3)
-                   : [wr0] "w"(wr0),
-                     [wr1] "w"(wr1),
-                     [wr2] "w"(wr2),
-                     [vbias] "w"(wbias),
-                     [mask1] "w"(vmask_rp1),
-                     [mask2] "w"(vmask_rp2),
-                     [vzero] "w"(vzero),
-                     [out1] "r"(doutr0),
-                     [out2] "r"(doutr1)
-                   : "cc",
-                     "memory",
-                     "v0",
-                     "v1",
-                     "v2",
-                     "v3",
-                     "v4",
-                     "v5",
-                     "v6",
-                     "v7",
-                     "v8",
-                     "v9",
-                     "v10",
-                     "v11",
-                     "v12",
-                     "v13",
-                     "v14",
-                     "v15");
-      break;
-#else
-      asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU
-                   : [din0] "+r"(din_ptr0),
-                     [din1] "+r"(din_ptr1),
-                     [din2] "+r"(din_ptr2),
-                     [din3] "+r"(din_ptr3),
-                     [vmask] "+r"(vmask_ptr)
-                   : [wr0] "w"(wr0),
-                     [wr1] "w"(wr1),
-                     [wr2] "w"(wr2),
-                     [vzero] "w"(vzero),
-                     [bias_val] "r"(bias_val),
-                     [out1] "r"(doutr0),
-                     [out2] "r"(doutr1)
-                   : "cc",
-                     "memory",
-                     "q4",
-                     "q5",
-                     "q6",
-                     "q7",
-                     "q8",
-                     "q9",
-                     "q10",
-                     "q11",
-                     "q12",
-                     "q13",
-                     "q14",
-                     "q15");
-      break;
-#endif
-    case lite_api::ActivationType::kRelu6:
-/* 0 <= din <= 6 */
-#ifdef __aarch64__
-      asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6
-                   : [din0] "+r"(din_ptr0),
-                     [din1] "+r"(din_ptr1),
-                     [din2] "+r"(din_ptr2),
-                     [din3] "+r"(din_ptr3)
-                   : [wr0] "w"(wr0),
-                     [wr1] "w"(wr1),
-                     [wr2] "w"(wr2),
-                     [vbias] "w"(wbias),
-                     [mask1] "w"(vmask_rp1),
-                     [mask2] "w"(vmask_rp2),
-                     [vzero] "w"(vzero),
-                     [vsix] "w"(vsix),
-                     [out1] "r"(doutr0),
-                     [out2] "r"(doutr1)
-                   : "cc",
-                     "memory",
-                     "v0",
-                     "v1",
-                     "v2",
-                     "v3",
-                     "v4",
-                     "v5",
-                     "v6",
-                     "v7",
-                     "v8",
-                     "v9",
-                     "v10",
-                     "v11",
-                     "v12",
-                     "v13",
-                     "v14",
-                     "v15");
+        dr0 = dr4;
+        dr1 = dr5;
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+        dr5 = dr4 + w_in;
+
+        //! process bottom pad
+        if (i + 5 >= h_in) {
+          switch (i + 5 - h_in) {
+            case 4:
+              din_ptr1 = zero_ptr;
+            case 3:
+              din_ptr2 = zero_ptr;
+            case 2:
+              din_ptr3 = zero_ptr;
+            case 1:
+              din_ptr4 = zero_ptr;
+            case 0:
+              din_ptr5 = zero_ptr;
+            default:
              break;
-#else
-      asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6
-                   : [din0] "+r"(din_ptr0),
-                     [din1] "+r"(din_ptr1),
-                     [din2] "+r"(din_ptr2),
-                     [din3] "+r"(din_ptr3),
-                     [vmask] "+r"(vmask_ptr)
-                   : [wr0] "w"(wr0),
-                     [wr1] "w"(wr1),
-                     [wr2] "w"(wr2),
-                     [vzero] "w"(vzero),
-                     [six_ptr] "r"(vsix),
-                     [bias_val] "r"(bias_val),
-                     [out1] "r"(doutr0),
-                     [out2] "r"(doutr1)
-                   : "cc",
-                     "memory",
-                     "q4",
-                     "q5",
-                     "q6",
-                     "q7",
-                     "q8",
-                     "q9",
-                     "q10",
-                     "q11",
-                     "q12",
-                     "q13",
-                     "q14",
-                     "q15");
+          }
+        }
+        //! process bottom remain
+        if (i + 4 > h_out) {
+          switch (i + 4 - h_out) {
+            case 3:
+              doutr1 = write_ptr;
+            case 2:
+              doutr2 = write_ptr;
+            case 1:
+              doutr3 = write_ptr;
+            default:
              break;
-#endif
-    case lite_api::ActivationType::kLeakyRelu:
-/*din = din >= 0 ? din : din * scale*/
-#ifdef __aarch64__
-      asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU
-                   : [din0] "+r"(din_ptr0),
-                     [din1] "+r"(din_ptr1),
-                     [din2] "+r"(din_ptr2),
-                     [din3] "+r"(din_ptr3)
-                   : [wr0] "w"(wr0),
-                     [wr1] "w"(wr1),
-                     [wr2] "w"(wr2),
-                     [vbias] "w"(wbias),
-                     [mask1] "w"(vmask_rp1),
-                     [mask2] "w"(vmask_rp2),
-                     [vzero] "w"(vzero),
+          }
+        }
+
+        int cnt = tile_w;
+        asm volatile(
+            INIT_S1
+            "ld1 {v8.4s}, [%[din_ptr4]], #16   \n"  /*vld1q_f32(din_ptr0)*/
+            "ld1 {v10.4s}, [%[din_ptr5]], #16   \n" /*vld1q_f32(din_ptr0)*/
+            "ext  v16.16b, v0.16b, v1.16b, #4 \n"   /* v16 = 1234 */
+            "ext  v17.16b, v0.16b, v1.16b, #8 \n"   /* v17 = 2345 */
+            "ld1 {v9.4s}, [%[din_ptr4]]   \n"       /*vld1q_f32(din_ptr0)*/
+            "ld1 {v11.4s}, [%[din_ptr5]]   \n"      /*vld1q_f32(din_ptr0)*/
+            MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
+            "cmp  %w[remain], #1             \n"
+            "blt 0f                         \n" RIGHT_COMPUTE_S1
+                RIGHT_RESULT_S1_LEAKY_RELU "0: \n"
+            : [cnt] "+r"(cnt),
+              [din_ptr0] "+r"(din_ptr0),
+              [din_ptr1] "+r"(din_ptr1),
+              [din_ptr2] "+r"(din_ptr2),
+              [din_ptr3] "+r"(din_ptr3),
+              [din_ptr4] "+r"(din_ptr4),
+              [din_ptr5] "+r"(din_ptr5),
+              [doutr0] "+r"(doutr0),
+              [doutr1] "+r"(doutr1),
+              [doutr2] "+r"(doutr2),
+              [doutr3] "+r"(doutr3)
+            : [w0] "w"(wr0),
+              [w1] "w"(wr1),
+              [w2] "w"(wr2),
              [vscale] "w"(vscale),
-                     [out1] "r"(doutr0),
-                     [out2] "r"(doutr1)
+              [bias_val] "r"(vbias),
+              [vmask] "r"(vmask),
+              [rmask] "r"(rmask),
+              [remain] "r"(remain)
            : "cc",
              "memory",
              "v0",
@@ -3683,23 +3736,81 @@ void act_switch_3x3s1p0_s(const float *din_ptr0,
              "v12",
              "v13",
              "v14",
-                     "v15");
-      break;
+              "v15",
+              "v16",
+              "v17",
+              "v18",
+              "v19",
+              "v20",
+              "v21",
+              "v22",
+              "v23",
+              "v24",
+              "v25");
+        dout_ptr = dout_ptr + 4 * w_out;
+      }
 #else
-      asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU
-                   : [din0] "+r"(din_ptr0),
-                     [din1] "+r"(din_ptr1),
-                     [din2] "+r"(din_ptr2),
-                     [din3] "+r"(din_ptr3),
+      for (int i = 0; i < h_out; i += 2) {
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;
+
+        doutr0 = dout_ptr;
+        doutr1 = dout_ptr + w_out;
+
+        dr0 = dr2;
+        dr1 = dr3;
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        //! process bottom pad
+        if (i + 4 > h_in) {
+          switch (i + 4 - h_in) {
+            case 3:
+              din_ptr1 = zero_ptr;
+            case 2:
+              din_ptr2 = zero_ptr;
+            case 1:
+              din_ptr3 = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! process bottom remain
+        if (i + 2 > h_out) {
+          doutr1 = write_ptr;
+        }
+        int cnt = tile_w;
+        unsigned int *rmask_ptr = rmask;
+        unsigned int *vmask_ptr = vmask;
+        asm volatile(INIT_S1
+                     "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "vext.32  q6, q8, q9, #1     @ 0012\n"
+                     "vext.32  q7, q8, q9, #2     @ 1234\n" MID_COMPUTE_S1
+                         MID_RESULT_S1_LEAKY_RELU
+                     "cmp  %[remain], #1             \n"
+                     "blt 0f                         \n" RIGHT_COMPUTE_S1
+                         RIGHT_RESULT_S1_LEAKY_RELU
+                     "0:                         \n"
+                     : [dout_ptr1] "+r"(doutr0),
+                       [dout_ptr2] "+r"(doutr1),
+                       [din0_ptr] "+r"(din_ptr0),
+                       [din1_ptr] "+r"(din_ptr1),
+                       [din2_ptr] "+r"(din_ptr2),
+                       [din3_ptr] "+r"(din_ptr3),
+                       [cnt] "+r"(cnt),
+                       [rmask] "+r"(rmask_ptr),
                       [vmask] "+r"(vmask_ptr)
                     : [wr0] "w"(wr0),
                       [wr1] "w"(wr1),
                       [wr2] "w"(wr2),
-                     [vzero] "w"(vzero),
-                     [scale_ptr] "r"(vscale),
+                       [scale_ptr] "r"(scale),
                       [bias_val] "r"(bias_val),
-                     [out1] "r"(doutr0),
-                     [out2] "r"(doutr1)
+                       [vzero] "w"(vzero),
+                       [remain] "r"(remain)
                     : "cc",
                       "memory",
                       "q4",
@@ -3714,21 +3825,18 @@ void act_switch_3x3s1p0_s(const float *din_ptr0,
                       "q13",
                       "q14",
                       "q15");
-      break;
+        dout_ptr += 2 * w_out;
+      }  //! end of processing mid rows
 #endif
-    default:
-      LOG(FATAL) << "this act_type: " << static_cast<int>(act_param.active_type)
-                 << " fuse not support";
+    }
  }
 }
-/**
- * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
- * width <= 4
- */
-void conv_depthwise_3x3s1p0_bias_s(float *dout,
+
+void conv_depthwise_3x3s1p0_bias_s_leakyRelu(float *dout,
                                             const float *din,
                                             const float *weights,
                                             const float *bias,
+                                             const float *scale,
                                             bool flag_bias,
                                             const int num,
                                             const int ch_in,
@@ -3736,11 +3844,7 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
                                             const int w_in,
                                             const int h_out,
                                             const int w_out,
-                                   const operators::ActivationParam act_param,
                                             ARMContext *ctx) {
-  //! 3x3s1 convolution, implemented by direct algorithm
-  //! pad is done implicit
-  //! for 4x6 convolution window
  const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
  const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f};

@@ -3750,6 +3854,10 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
  uint32x4_t vmask_rp2 =
      vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in));

+#ifdef __aarch64__
+  float32x4_t vscale = vld1q_f32(scale);
+#endif
+
  unsigned int vmask[8];
  vst1q_u32(vmask, vmask_rp1);
  vst1q_u32(vmask + 4, vmask_rp2);
@@ -3808,22 +3916,70 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
          doutr1 = trash_buf;
        }
        unsigned int *vmask_ptr = vmask;
-        act_switch_3x3s1p0_s(dr0,
-                             dr1,
-                             dr2,
-                             dr3,
-                             out_buf1,
-                             out_buf2,
-                             wr0,
-                             wr1,
-                             wr2,
-                             vmask_rp1,
-                             vmask_rp2,
-                             vzero,
-                             wbias,
-                             vmask_ptr,
-                             bias_val,
-                             act_param);
+#ifdef __aarch64__
+        asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU
+                     : [din0] "+r"(dr0),
+                       [din1] "+r"(dr1),
+                       [din2] "+r"(dr2),
+                       [din3] "+r"(dr3)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [vbias] "w"(wbias),
+                       [mask1] "w"(vmask_rp1),
+                       [mask2] "w"(vmask_rp2),
+                       [vzero] "w"(vzero),
+                       [vscale] "w"(vscale),
+                       [out1] "r"(doutr0),
+                       [out2] "r"(doutr1)
+                     : "cc",
+                       "memory",
+                       "v0",
+                       "v1",
+                       "v2",
+                       "v3",
+                       "v4",
+                       "v5",
+                       "v6",
+                       "v7",
+                       "v8",
+                       "v9",
+                       "v10",
+                       "v11",
+                       "v12",
+                       "v13",
+                       "v14",
+                       "v15");
+#else
+        asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU
+                     : [din0] "+r"(dr0),
+                       [din1] "+r"(dr1),
+                       [din2] "+r"(dr2),
+                       [din3] "+r"(dr3),
+                       [vmask] "+r"(vmask_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [vzero] "w"(vzero),
+                       [scale_ptr] "r"(scale),
+                       [bias_val] "r"(bias_val),
+                       [out1] "r"(doutr0),
+                       [out2] "r"(doutr1)
+                     : "cc",
+                       "memory",
+                       "q4",
+                       "q5",
+                       "q6",
+                       "q7",
+                       "q8",
+                       "q9",
+                       "q10",
+                       "q11",
+                       "q12",
+                       "q13",
+                       "q14",
+                       "q15");
+#endif
        for (int w = 0; w < w_out; ++w) {
          *doutr0++ = out_buf1[w];
          *doutr1++ = out_buf2[w];

--- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc
+++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc
@@ -1202,7 +1202,7 @@ namespace math {
 * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
 * width > 4
 */
-void conv_depthwise_3x3s1p1_bias_relu(float *dout,
+void conv_depthwise_3x3s1p1_bias_no_relu(float *dout,
                                         const float *din,
                                         const float *weights,
                                         const float *bias,
@@ -1363,10 +1363,8 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
        }

        int cnt = cnt_col;
-        if (flag_relu) {
-          asm volatile(
-              INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1
-                  MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU
+        asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1
+                         MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1
                     : [cnt] "+r"(cnt),
                       [din_ptr0] "+r"(din_ptr0),
                       [din_ptr1] "+r"(din_ptr1),
@@ -1413,9 +1411,256 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
                       "v23",
                       "v24",
                       "v25");
+        dout_ptr = dout_ptr + 4 * w_out;
+      }
+#else
+      for (int i = 0; i < h_in; i += 2) {
+        //! process top padding (pad_h = 1)
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;
+
+        doutr0 = dout_ptr;
+        doutr1 = dout_ptr + w_out;
+        // unsigned int* rst_mask = rmask;
+
+        if (i == 0) {
+          din_ptr0 = zero_ptr;
+          din_ptr1 = dr0;
+          din_ptr2 = dr1;
+          din_ptr3 = dr2;
+          dr0 = dr1;
+          dr1 = dr2;
+          dr2 = dr3;
+          dr3 = dr2 + w_in;
        } else {
+          dr0 = dr2;
+          dr1 = dr3;
+          dr2 = dr1 + w_in;
+          dr3 = dr2 + w_in;
+        }
+        //! process bottom pad
+        if (i + 3 > h_in) {
+          switch (i + 3 - h_in) {
+            case 3:
+              din_ptr1 = zero_ptr;
+            case 2:
+              din_ptr2 = zero_ptr;
+            case 1:
+              din_ptr3 = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! process bottom remain
+        if (i + 2 > h_out) {
+          doutr1 = write_ptr;
+        }
+        int cnt = cnt_col;
+        unsigned int *rmask_ptr = rmask;
+        unsigned int *vmask_ptr = vmask;
        asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1
                         MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1
+                     : [dout_ptr1] "+r"(doutr0),
+                       [dout_ptr2] "+r"(doutr1),
+                       [din0_ptr] "+r"(din_ptr0),
+                       [din1_ptr] "+r"(din_ptr1),
+                       [din2_ptr] "+r"(din_ptr2),
+                       [din3_ptr] "+r"(din_ptr3),
+                       [cnt] "+r"(cnt),
+                       [rmask] "+r"(rmask_ptr),
+                       [vmask] "+r"(vmask_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [bias_val] "r"(bias_val),
+                       [vzero] "w"(vzero)
+                     : "cc",
+                       "memory",
+                       "q4",
+                       "q5",
+                       "q6",
+                       "q7",
+                       "q8",
+                       "q9",
+                       "q10",
+                       "q11",
+                       "q12",
+                       "q13",
+                       "q14",
+                       "q15");
+        dout_ptr += 2 * w_out;
+      }  //! end of processing mid rows
+#endif
+    }
+  }
+}
+
+void conv_depthwise_3x3s1p1_bias_relu(float *dout,
+                                      const float *din,
+                                      const float *weights,
+                                      const float *bias,
+                                      bool flag_bias,
+                                      bool flag_relu,
+                                      const int num,
+                                      const int ch_in,
+                                      const int h_in,
+                                      const int w_in,
+                                      const int h_out,
+                                      const int w_out,
+                                      ARMContext *ctx) {
+  //! padding is done implicitly
+  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+  //! for 4x6 convolution window
+  const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
+
+  float *zero_ptr = ctx->workspace_data<float>();
+  memset(zero_ptr, 0, w_in * sizeof(float));
+  float *write_ptr = zero_ptr + w_in;
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+  int w_stride = 9;
+
+  int tile_w = w_out >> 2;
+  int remain = w_out % 4;
+  int cnt_col = tile_w - 1;
+
+  unsigned int size_pad_right = (unsigned int)(5 + (tile_w << 2) - w_in);
+  const unsigned int remian_idx[4] = {0, 1, 2, 3};
+
+  if (remain == 0 && size_pad_right == 5) {
+    size_pad_right = 1;
+    cnt_col -= 1;
+    remain = 4;
+  } else if (remain == 0 && size_pad_right == 6) {
+    size_pad_right = 2;
+    cnt_col -= 1;
+    remain = 4;
+  }
+
+  uint32x4_t vmask_rp1 =
+      vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_rp2 =
+      vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_result =
+      vcgtq_u32(vdupq_n_u32(remain), vld1q_u32(remian_idx));
+
+  unsigned int vmask[8];
+  vst1q_u32(vmask, vmask_rp1);
+  vst1q_u32(vmask + 4, vmask_rp2);
+
+  unsigned int rmask[4];
+  vst1q_u32(rmask, vmask_result);
+
+  float32x4_t vzero = vdupq_n_f32(0.f);
+
+  for (int n = 0; n < num; ++n) {
+    const float *din_batch = din + n * ch_in * size_in_channel;
+    float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int c = 0; c < ch_in; c++) {
+      float *dout_ptr = dout_batch + c * size_out_channel;
+
+      const float *din_ch_ptr = din_batch + c * size_in_channel;
+
+      float bias_val = flag_bias ? bias[c] : 0.f;
+      float vbias[4] = {bias_val, bias_val, bias_val, bias_val};
+
+      const float *wei_ptr = weights + c * w_stride;
+
+      float32x4_t wr0 = vld1q_f32(wei_ptr);
+      float32x4_t wr1 = vld1q_f32(wei_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(wei_ptr + 6);
+
+      float *doutr0 = dout_ptr;
+      float *doutr1 = doutr0 + w_out;
+      float *doutr2 = doutr1 + w_out;
+      float *doutr3 = doutr2 + w_out;
+
+      const float *dr0 = din_ch_ptr;
+      const float *dr1 = dr0 + w_in;
+      const float *dr2 = dr1 + w_in;
+      const float *dr3 = dr2 + w_in;
+      const float *dr4 = dr3 + w_in;
+      const float *dr5 = dr4 + w_in;
+
+      const float *din_ptr0 = dr0;
+      const float *din_ptr1 = dr1;
+      const float *din_ptr2 = dr2;
+      const float *din_ptr3 = dr3;
+      const float *din_ptr4 = dr4;
+      const float *din_ptr5 = dr5;
+      float *ptr_zero = const_cast<float *>(zero);
+#ifdef __aarch64__
+      for (int i = 0; i < h_in; i += 4) {
+        //! process top padding (pad_h = 1)
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;
+        din_ptr4 = dr4;
+        din_ptr5 = dr5;
+
+        doutr0 = dout_ptr;
+        doutr1 = doutr0 + w_out;
+        doutr2 = doutr1 + w_out;
+        doutr3 = doutr2 + w_out;
+        if (i == 0) {
+          din_ptr0 = zero_ptr;
+          din_ptr1 = dr0;
+          din_ptr2 = dr1;
+          din_ptr3 = dr2;
+          din_ptr4 = dr3;
+          din_ptr5 = dr4;
+          dr0 = dr3;
+          dr1 = dr4;
+          dr2 = dr5;
+        } else {
+          dr0 = dr4;
+          dr1 = dr5;
+          dr2 = dr1 + w_in;
+        }
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+        dr5 = dr4 + w_in;
+
+        //! process bottom pad
+        if (i + 5 > h_in) {
+          switch (i + 5 - h_in) {
+            case 5:
+              din_ptr1 = zero_ptr;
+            case 4:
+              din_ptr2 = zero_ptr;
+            case 3:
+              din_ptr3 = zero_ptr;
+            case 2:
+              din_ptr4 = zero_ptr;
+            case 1:
+              din_ptr5 = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! process bottom remain
+        if (i + 4 > h_out) {
+          switch (i + 4 - h_out) {
+            case 3:
+              doutr1 = write_ptr;
+            case 2:
+              doutr2 = write_ptr;
+            case 1:
+              doutr3 = write_ptr;
+            default:
+              break;
+          }
+        }
+
+        int cnt = cnt_col;
+        asm volatile(
+            INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1
+                MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU
            : [cnt] "+r"(cnt),
              [din_ptr0] "+r"(din_ptr0),
              [din_ptr1] "+r"(din_ptr1),
@@ -1462,7 +1707,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
              "v23",
              "v24",
              "v25");
-        }
        dout_ptr = dout_ptr + 4 * w_out;
      }
 #else
@@ -1512,7 +1756,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
        int cnt = cnt_col;
        unsigned int *rmask_ptr = rmask;
        unsigned int *vmask_ptr = vmask;
-        if (flag_relu) {
        asm volatile(
            INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1
                MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU
@@ -1544,38 +1787,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
              "q13",
              "q14",
              "q15");
-        } else {
-          asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1
-                           MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1
-                       : [dout_ptr1] "+r"(doutr0),
-                         [dout_ptr2] "+r"(doutr1),
-                         [din0_ptr] "+r"(din_ptr0),
-                         [din1_ptr] "+r"(din_ptr1),
-                         [din2_ptr] "+r"(din_ptr2),
-                         [din3_ptr] "+r"(din_ptr3),
-                         [cnt] "+r"(cnt),
-                         [rmask] "+r"(rmask_ptr),
-                         [vmask] "+r"(vmask_ptr)
-                       : [wr0] "w"(wr0),
-                         [wr1] "w"(wr1),
-                         [wr2] "w"(wr2),
-                         [bias_val] "r"(bias_val),
-                         [vzero] "w"(vzero)
-                       : "cc",
-                         "memory",
-                         "q4",
-                         "q5",
-                         "q6",
-                         "q7",
-                         "q8",
-                         "q9",
-                         "q10",
-                         "q11",
-                         "q12",
-                         "q13",
-                         "q14",
-                         "q15");
-        }
        dout_ptr += 2 * w_out;
      }  //! end of processing mid rows
 #endif
@@ -1587,7 +1798,7 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
 * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
 * width <= 4
 */
-void conv_depthwise_3x3s1p1_bias_s_relu(float *dout,
+void conv_depthwise_3x3s1p1_bias_s_no_relu(float *dout,
                                           const float *din,
                                           const float *weights,
                                           const float *bias,
@@ -1660,8 +1871,7 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float *dout,
            break;
        }
 #ifdef __aarch64__
-        if (flag_relu) {
-          asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU
+        asm volatile(COMPUTE_S_S1 RESULT_S_S1
                     : [din0] "+r"(dr0),
                       [din1] "+r"(dr1),
                       [din2] "+r"(dr2),
@@ -1692,8 +1902,119 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float *dout,
                       "v15",
                       "v16",
                       "v17");
-        } else {
+#else
        asm volatile(COMPUTE_S_S1 RESULT_S_S1
+                     : [din0] "+r"(dr0),
+                       [din1] "+r"(dr1),
+                       [din2] "+r"(dr2),
+                       [din3] "+r"(dr3)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [vzero] "w"(vzero),
+                       [mask] "w"(vmask_rp),
+                       [bias] "w"(wbias),
+                       [out1] "r"(out_buf1),
+                       [out2] "r"(out_buf2)
+                     : "cc",
+                       "memory",
+                       "q6",
+                       "q7",
+                       "q8",
+                       "q9",
+                       "q10",
+                       "q11",
+                       "q12",
+                       "q13",
+                       "q14",
+                       "q15");
+#endif
+        for (int w = 0; w < w_out; ++w) {
+          *doutr0++ = out_buf1[w];
+          *doutr1++ = out_buf2[w];
+        }
+        doutr0 = doutr1;
+        doutr1 += w_out;
+        hs += 2;
+        he += 2;
+      }  // end of processing heights
+    }    // end of processing channels
+  }      // end of processing batches
+}
+void conv_depthwise_3x3s1p1_bias_s_relu(float *dout,
+                                        const float *din,
+                                        const float *weights,
+                                        const float *bias,
+                                        bool flag_bias,
+                                        bool flag_relu,
+                                        const int num,
+                                        const int ch_in,
+                                        const int h_in,
+                                        const int w_in,
+                                        const int h_out,
+                                        const int w_out,
+                                        ARMContext *ctx) {
+  //! 3x3s1 convolution, implemented by direct algorithm
+  //! padding is done implicitly
+  //! for 4x6 convolution window
+  const int right_pad_idx[4] = {3, 2, 1, 0};
+  const float zero[4] = {0.f, 0.f, 0.f, 0.f};
+
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  uint32x4_t vmask_rp =
+      vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in));
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+  for (int n = 0; n < num; ++n) {
+    const float *din_batch = din + n * ch_in * size_in_channel;
+    float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      float *dout_channel = dout_batch + i * size_out_channel;
+      const float *din_channel = din_batch + i * size_in_channel;
+      const float *weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+      float32x4_t wbias;
+      if (flag_bias) {
+        wbias = vdupq_n_f32(bias[i]);
+      } else {
+        wbias = vdupq_n_f32(0.f);
+      }
+
+      int hs = -1;
+      int he = 3;
+
+      float out_buf1[4];
+      float out_buf2[4];
+      float trash_buf[4];
+
+      int h_cnt = (h_out + 1) >> 1;
+      float *doutr0 = dout_channel;
+      float *doutr1 = dout_channel + w_out;
+
+      for (int j = 0; j < h_cnt; ++j) {
+        const float *dr0 = din_channel + hs * w_in;
+        const float *dr1 = dr0 + w_in;
+        const float *dr2 = dr1 + w_in;
+        const float *dr3 = dr2 + w_in;
+
+        if (hs == -1) {
+          dr0 = zero;
+        }
+
+        switch (he - h_in) {
+          case 2:
+            dr2 = zero;
+            doutr1 = trash_buf;
+          case 1:
+            dr3 = zero;
+          default:
+            break;
+        }
+#ifdef __aarch64__
+        asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU
                     : [din0] "+r"(dr0),
                       [din1] "+r"(dr1),
                       [din2] "+r"(dr2),
@@ -1724,9 +2045,7 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float *dout,
                       "v15",
                       "v16",
                       "v17");
-        }
 #else
-        if (flag_relu) {
        asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU
                     : [din0] "+r"(dr0),
                       [din1] "+r"(dr1),
@@ -1752,33 +2071,6 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float *dout,
                       "q13",
                       "q14",
                       "q15");
-        } else {
-          asm volatile(COMPUTE_S_S1 RESULT_S_S1
-                       : [din0] "+r"(dr0),
-                         [din1] "+r"(dr1),
-                         [din2] "+r"(dr2),
-                         [din3] "+r"(dr3)
-                       : [wr0] "w"(wr0),
-                         [wr1] "w"(wr1),
-                         [wr2] "w"(wr2),
-                         [vzero] "w"(vzero),
-                         [mask] "w"(vmask_rp),
-                         [bias] "w"(wbias),
-                         [out1] "r"(out_buf1),
-                         [out2] "r"(out_buf2)
-                       : "cc",
-                         "memory",
-                         "q6",
-                         "q7",
-                         "q8",
-                         "q9",
-                         "q10",
-                         "q11",
-                         "q12",
-                         "q13",
-                         "q14",
-                         "q15");
-        }
 #endif
        for (int w = 0; w < w_out; ++w) {
          *doutr0++ = out_buf1[w];
@@ -1797,7 +2089,7 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float *dout,
 * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
 * width > 4
 */
-void conv_depthwise_3x3s1p0_bias_relu(float *dout,
+void conv_depthwise_3x3s1p0_bias_no_relu(float *dout,
                                         const float *din,
                                         const float *weights,
                                         const float *bias,
@@ -1937,67 +2229,6 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
        }

        int cnt = tile_w;
-        if (flag_relu) {
-          asm volatile(
-              INIT_S1
-              "ld1 {v8.4s}, [%[din_ptr4]], #16   \n"  /*vld1q_f32(din_ptr0)*/
-              "ld1 {v10.4s}, [%[din_ptr5]], #16   \n" /*vld1q_f32(din_ptr0)*/
-              "ext  v16.16b, v0.16b, v1.16b, #4 \n"   /* v16 = 1234 */
-              "ext  v17.16b, v0.16b, v1.16b, #8 \n"   /* v17 = 2345 */
-              "ld1 {v9.4s}, [%[din_ptr4]]   \n"       /*vld1q_f32(din_ptr0)*/
-              "ld1 {v11.4s}, [%[din_ptr5]]   \n"      /*vld1q_f32(din_ptr0)*/
-              MID_COMPUTE_S1 MID_RESULT_S1_RELU
-              "cmp  %w[remain], #1             \n"
-              "blt 0f                         \n" RIGHT_COMPUTE_S1
-                  RIGHT_RESULT_S1_RELU "0: \n"
-              : [cnt] "+r"(cnt),
-                [din_ptr0] "+r"(din_ptr0),
-                [din_ptr1] "+r"(din_ptr1),
-                [din_ptr2] "+r"(din_ptr2),
-                [din_ptr3] "+r"(din_ptr3),
-                [din_ptr4] "+r"(din_ptr4),
-                [din_ptr5] "+r"(din_ptr5),
-                [doutr0] "+r"(doutr0),
-                [doutr1] "+r"(doutr1),
-                [doutr2] "+r"(doutr2),
-                [doutr3] "+r"(doutr3)
-              : [w0] "w"(wr0),
-                [w1] "w"(wr1),
-                [w2] "w"(wr2),
-                [bias_val] "r"(vbias),
-                [vmask] "r"(vmask),
-                [rmask] "r"(rmask),
-                [vzero] "w"(vzero),
-                [remain] "r"(remain)
-              : "cc",
-                "memory",
-                "v0",
-                "v1",
-                "v2",
-                "v3",
-                "v4",
-                "v5",
-                "v6",
-                "v7",
-                "v8",
-                "v9",
-                "v10",
-                "v11",
-                "v12",
-                "v13",
-                "v14",
-                "v15",
-                "v16",
-                "v17",
-                "v18",
-                "v19",
-                "v20",
-                "v21",
-                "v22",
-                "v23",
-                "v24",
-                "v25");
-        } else {
        asm volatile(
            INIT_S1
            "ld1 {v8.4s}, [%[din_ptr4]], #16   \n"  /*vld1q_f32(din_ptr0)*/
@@ -2008,8 +2239,8 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
            "ld1 {v11.4s}, [%[din_ptr5]]   \n"      /*vld1q_f32(din_ptr0)*/
            MID_COMPUTE_S1 MID_RESULT_S1
            "cmp  %w[remain], #1             \n"
-              "blt 0f                         \n" RIGHT_COMPUTE_S1
-                  RIGHT_RESULT_S1 "0: \n"
+            "blt 0f                         \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1
+            "0: \n"
            : [cnt] "+r"(cnt),
              [din_ptr0] "+r"(din_ptr0),
              [din_ptr1] "+r"(din_ptr1),
@@ -2057,7 +2288,6 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
              "v23",
              "v24",
              "v25");
-        }
        dout_ptr = dout_ptr + 4 * w_out;
      }
 #else
@@ -2096,18 +2326,17 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
        int cnt = tile_w;
        unsigned int *rmask_ptr = rmask;
        unsigned int *vmask_ptr = vmask;
-        if (flag_relu) {
-          asm volatile(INIT_S1
+        asm volatile(
+            INIT_S1
            "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
            "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
            "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
            "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
            "vext.32  q6, q8, q9, #1     @ 0012\n"
-                       "vext.32  q7, q8, q9, #2     @ 1234\n" MID_COMPUTE_S1
-                           MID_RESULT_S1_RELU
+            "vext.32  q7, q8, q9, #2     @ 1234\n" MID_COMPUTE_S1 MID_RESULT_S1
            "cmp  %[remain], #1             \n"
-                       "blt 0f                         \n" RIGHT_COMPUTE_S1
-                           RIGHT_RESULT_S1_RELU "0:                         \n"
+            "blt 0f                         \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1
+            "0:                         \n"
            : [dout_ptr1] "+r"(doutr0),
              [dout_ptr2] "+r"(doutr1),
              [din0_ptr] "+r"(din_ptr0),
@@ -2137,20 +2366,263 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
              "q13",
              "q14",
              "q15");
-        } else {
-          asm volatile(INIT_S1
-                       "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
-                       "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
-                       "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
-                       "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
-                       "vext.32  q6, q8, q9, #1     @ 0012\n"
-                       "vext.32  q7, q8, q9, #2     @ 1234\n" MID_COMPUTE_S1
-                           MID_RESULT_S1
-                       "cmp  %[remain], #1             \n"
-                       "blt 0f                         \n" RIGHT_COMPUTE_S1
-                           RIGHT_RESULT_S1 "0:                         \n"
-                       : [dout_ptr1] "+r"(doutr0),
-                         [dout_ptr2] "+r"(doutr1),
+        dout_ptr += 2 * w_out;
+      }  //! end of processing mid rows
+#endif
+    }
+  }
+}
+
+void conv_depthwise_3x3s1p0_bias_relu(float *dout,
+                                      const float *din,
+                                      const float *weights,
+                                      const float *bias,
+                                      bool flag_bias,
+                                      bool flag_relu,
+                                      const int num,
+                                      const int ch_in,
+                                      const int h_in,
+                                      const int w_in,
+                                      const int h_out,
+                                      const int w_out,
+                                      ARMContext *ctx) {
+  //! pad is done implicit
+  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+  //! for 4x6 convolution window
+  const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
+
+  float *zero_ptr = ctx->workspace_data<float>();
+  memset(zero_ptr, 0, w_in * sizeof(float));
+  float *write_ptr = zero_ptr + w_in;
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+  int w_stride = 9;
+
+  int tile_w = w_out >> 2;
+  int remain = w_out % 4;
+
+  unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in);
+  const int remian_idx[4] = {0, 1, 2, 3};
+
+  uint32x4_t vmask_rp1 =
+      vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_rp2 =
+      vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_result =
+      vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx));
+
+  unsigned int vmask[8];
+  vst1q_u32(vmask, vmask_rp1);
+  vst1q_u32(vmask + 4, vmask_rp2);
+
+  unsigned int rmask[4];
+  vst1q_u32(rmask, vmask_result);
+
+  float32x4_t vzero = vdupq_n_f32(0.f);
+
+  for (int n = 0; n < num; ++n) {
+    const float *din_batch = din + n * ch_in * size_in_channel;
+    float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int c = 0; c < ch_in; c++) {
+      float *dout_ptr = dout_batch + c * size_out_channel;
+
+      const float *din_ch_ptr = din_batch + c * size_in_channel;
+
+      float bias_val = flag_bias ? bias[c] : 0.f;
+      float vbias[4] = {bias_val, bias_val, bias_val, bias_val};
+
+      const float *wei_ptr = weights + c * w_stride;
+
+      float32x4_t wr0 = vld1q_f32(wei_ptr);
+      float32x4_t wr1 = vld1q_f32(wei_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(wei_ptr + 6);
+
+      float *doutr0 = dout_ptr;
+      float *doutr1 = doutr0 + w_out;
+      float *doutr2 = doutr1 + w_out;
+      float *doutr3 = doutr2 + w_out;
+
+      const float *dr0 = din_ch_ptr;
+      const float *dr1 = dr0 + w_in;
+      const float *dr2 = dr1 + w_in;
+      const float *dr3 = dr2 + w_in;
+      const float *dr4 = dr3 + w_in;
+      const float *dr5 = dr4 + w_in;
+
+      const float *din_ptr0 = dr0;
+      const float *din_ptr1 = dr1;
+      const float *din_ptr2 = dr2;
+      const float *din_ptr3 = dr3;
+      const float *din_ptr4 = dr4;
+      const float *din_ptr5 = dr5;
+
+      float *ptr_zero = const_cast<float *>(zero);
+#ifdef __aarch64__
+      for (int i = 0; i < h_out; i += 4) {
+        //! process top pad pad_h = 1
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;
+        din_ptr4 = dr4;
+        din_ptr5 = dr5;
+
+        doutr0 = dout_ptr;
+        doutr1 = doutr0 + w_out;
+        doutr2 = doutr1 + w_out;
+        doutr3 = doutr2 + w_out;
+
+        dr0 = dr4;
+        dr1 = dr5;
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+        dr5 = dr4 + w_in;
+
+        //! process bottom pad
+        if (i + 5 >= h_in) {
+          switch (i + 5 - h_in) {
+            case 4:
+              din_ptr1 = zero_ptr;
+            case 3:
+              din_ptr2 = zero_ptr;
+            case 2:
+              din_ptr3 = zero_ptr;
+            case 1:
+              din_ptr4 = zero_ptr;
+            case 0:
+              din_ptr5 = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! process bottom remain
+        if (i + 4 > h_out) {
+          switch (i + 4 - h_out) {
+            case 3:
+              doutr1 = write_ptr;
+            case 2:
+              doutr2 = write_ptr;
+            case 1:
+              doutr3 = write_ptr;
+            default:
+              break;
+          }
+        }
+
+        int cnt = tile_w;
+        asm volatile(
+            INIT_S1
+            "ld1 {v8.4s}, [%[din_ptr4]], #16   \n"  /*vld1q_f32(din_ptr0)*/
+            "ld1 {v10.4s}, [%[din_ptr5]], #16   \n" /*vld1q_f32(din_ptr0)*/
+            "ext  v16.16b, v0.16b, v1.16b, #4 \n"   /* v16 = 1234 */
+            "ext  v17.16b, v0.16b, v1.16b, #8 \n"   /* v17 = 2345 */
+            "ld1 {v9.4s}, [%[din_ptr4]]   \n"       /*vld1q_f32(din_ptr0)*/
+            "ld1 {v11.4s}, [%[din_ptr5]]   \n"      /*vld1q_f32(din_ptr0)*/
+            MID_COMPUTE_S1 MID_RESULT_S1_RELU
+            "cmp  %w[remain], #1             \n"
+            "blt 0f                         \n" RIGHT_COMPUTE_S1
+                RIGHT_RESULT_S1_RELU "0: \n"
+            : [cnt] "+r"(cnt),
+              [din_ptr0] "+r"(din_ptr0),
+              [din_ptr1] "+r"(din_ptr1),
+              [din_ptr2] "+r"(din_ptr2),
+              [din_ptr3] "+r"(din_ptr3),
+              [din_ptr4] "+r"(din_ptr4),
+              [din_ptr5] "+r"(din_ptr5),
+              [doutr0] "+r"(doutr0),
+              [doutr1] "+r"(doutr1),
+              [doutr2] "+r"(doutr2),
+              [doutr3] "+r"(doutr3)
+            : [w0] "w"(wr0),
+              [w1] "w"(wr1),
+              [w2] "w"(wr2),
+              [bias_val] "r"(vbias),
+              [vmask] "r"(vmask),
+              [rmask] "r"(rmask),
+              [vzero] "w"(vzero),
+              [remain] "r"(remain)
+            : "cc",
+              "memory",
+              "v0",
+              "v1",
+              "v2",
+              "v3",
+              "v4",
+              "v5",
+              "v6",
+              "v7",
+              "v8",
+              "v9",
+              "v10",
+              "v11",
+              "v12",
+              "v13",
+              "v14",
+              "v15",
+              "v16",
+              "v17",
+              "v18",
+              "v19",
+              "v20",
+              "v21",
+              "v22",
+              "v23",
+              "v24",
+              "v25");
+        dout_ptr = dout_ptr + 4 * w_out;
+      }
+#else
+      for (int i = 0; i < h_out; i += 2) {
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;
+
+        doutr0 = dout_ptr;
+        doutr1 = dout_ptr + w_out;
+
+        dr0 = dr2;
+        dr1 = dr3;
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        //! process bottom pad
+        if (i + 3 >= h_in) {
+          switch (i + 3 - h_in) {
+            case 3:
+              din_ptr1 = zero_ptr;
+            case 2:
+              din_ptr2 = zero_ptr;
+            case 1:
+              din_ptr3 = zero_ptr;
+            case 0:
+              din_ptr3 = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! process bottom remain
+        if (i + 2 > h_out) {
+          doutr1 = write_ptr;
+        }
+        int cnt = tile_w;
+        unsigned int *rmask_ptr = rmask;
+        unsigned int *vmask_ptr = vmask;
+        asm volatile(INIT_S1
+                     "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "vext.32  q6, q8, q9, #1     @ 0012\n"
+                     "vext.32  q7, q8, q9, #2     @ 1234\n" MID_COMPUTE_S1
+                         MID_RESULT_S1_RELU
+                     "cmp  %[remain], #1             \n"
+                     "blt 0f                         \n" RIGHT_COMPUTE_S1
+                         RIGHT_RESULT_S1_RELU "0:                         \n"
+                     : [dout_ptr1] "+r"(doutr0),
+                       [dout_ptr2] "+r"(doutr1),
                       [din0_ptr] "+r"(din_ptr0),
                       [din1_ptr] "+r"(din_ptr1),
                       [din2_ptr] "+r"(din_ptr2),
@@ -2178,7 +2650,6 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
                       "q13",
                       "q14",
                       "q15");
-        }
        dout_ptr += 2 * w_out;
      }  //! end of processing mid rows
 #endif
@@ -2189,7 +2660,7 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
 * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
 * width <= 4
 */
-void conv_depthwise_3x3s1p0_bias_s_relu(float *dout,
+void conv_depthwise_3x3s1p0_bias_s_no_relu(float *dout,
                                           const float *din,
                                           const float *weights,
                                           const float *bias,
@@ -2276,8 +2747,7 @@ void conv_depthwise_3x3s1p0_bias_s_relu(float *dout,
          }
        }
 #ifdef __aarch64__
-        if (flag_relu) {
-          asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU
+        asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1
                     : [din0] "+r"(dr0),
                       [din1] "+r"(dr1),
                       [din2] "+r"(dr2),
@@ -2309,8 +2779,134 @@ void conv_depthwise_3x3s1p0_bias_s_relu(float *dout,
                       "v13",
                       "v14",
                       "v15");
-        } else {
+#else
+        unsigned int *vmask_ptr = vmask;
+        float bias_val = flag_bias ? bias[i] : 0.f;
        asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1
+                     : [din0] "+r"(dr0),
+                       [din1] "+r"(dr1),
+                       [din2] "+r"(dr2),
+                       [din3] "+r"(dr3),
+                       [vmask] "+r"(vmask_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [vzero] "w"(vzero),
+                       [bias_val] "r"(bias_val),
+                       [out1] "r"(out_buf1),
+                       [out2] "r"(out_buf2)
+                     : "cc",
+                       "memory",
+                       "q4",
+                       "q5",
+                       "q6",
+                       "q7",
+                       "q8",
+                       "q9",
+                       "q10",
+                       "q11",
+                       "q12",
+                       "q13",
+                       "q14",
+                       "q15");
+#endif
+        for (int w = 0; w < w_out; ++w) {
+          *doutr0++ = out_buf1[w];
+          *doutr1++ = out_buf2[w];
+        }
+      }  // end of processing heights
+    }    // end of processing channels
+  }      // end of processing batchs
+}
+
+void conv_depthwise_3x3s1p0_bias_s_relu(float *dout,
+                                        const float *din,
+                                        const float *weights,
+                                        const float *bias,
+                                        bool flag_bias,
+                                        bool flag_relu,
+                                        const int num,
+                                        const int ch_in,
+                                        const int h_in,
+                                        const int w_in,
+                                        const int h_out,
+                                        const int w_out,
+                                        ARMContext *ctx) {
+  //! 3x3s1 convolution, implemented by direct algorithm
+  //! pad is done implicit
+  //! for 4x6 convolution window
+  const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
+  const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f};
+
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  uint32x4_t vmask_rp1 =
+      vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in));
+  uint32x4_t vmask_rp2 =
+      vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in));
+
+  unsigned int vmask[8];
+  vst1q_u32(vmask, vmask_rp1);
+  vst1q_u32(vmask + 4, vmask_rp2);
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+  for (int n = 0; n < num; ++n) {
+    const float *din_batch = din + n * ch_in * size_in_channel;
+    float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      float *dout_channel = dout_batch + i * size_out_channel;
+      const float *din_channel = din_batch + i * size_in_channel;
+      const float *weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+
+#ifdef __aarch64__
+      float32x4_t wbias;
+      if (flag_bias) {
+        wbias = vdupq_n_f32(bias[i]);
+      } else {
+        wbias = vdupq_n_f32(0.f);
+      }
+#endif  // __aarch64__
+
+      float out_buf1[4];
+      float out_buf2[4];
+      float trash_buf[4];
+
+      float *doutr0 = dout_channel;
+      float *doutr1 = dout_channel + w_out;
+
+      for (int j = 0; j < h_out; j += 2) {
+        const float *dr0 = din_channel + j * w_in;
+        const float *dr1 = dr0 + w_in;
+        const float *dr2 = dr1 + w_in;
+        const float *dr3 = dr2 + w_in;
+
+        doutr0 = dout_channel + j * w_out;
+        doutr1 = doutr0 + w_out;
+
+        if (j + 3 >= h_in) {
+          switch (j + 3 - h_in) {
+            case 3:
+              dr1 = zero_ptr;
+            case 2:
+              dr2 = zero_ptr;
+            case 1:
+              dr3 = zero_ptr;
+              doutr1 = trash_buf;
+            case 0:
+              dr3 = zero_ptr;
+              if (j + 2 > h_out) {
+                doutr1 = trash_buf;
+              }
+            default:
+              break;
+          }
+        }
+#ifdef __aarch64__
+        asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU
                     : [din0] "+r"(dr0),
                       [din1] "+r"(dr1),
                       [din2] "+r"(dr2),
@@ -2342,11 +2938,9 @@ void conv_depthwise_3x3s1p0_bias_s_relu(float *dout,
                       "v13",
                       "v14",
                       "v15");
-        }
 #else
        unsigned int *vmask_ptr = vmask;
        float bias_val = flag_bias ? bias[i] : 0.f;
-        if (flag_relu) {
        asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU
                     : [din0] "+r"(dr0),
                       [din1] "+r"(dr1),
@@ -2374,35 +2968,6 @@ void conv_depthwise_3x3s1p0_bias_s_relu(float *dout,
                       "q13",
                       "q14",
                       "q15");
-        } else {
-          asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1
-                       : [din0] "+r"(dr0),
-                         [din1] "+r"(dr1),
-                         [din2] "+r"(dr2),
-                         [din3] "+r"(dr3),
-                         [vmask] "+r"(vmask_ptr)
-                       : [wr0] "w"(wr0),
-                         [wr1] "w"(wr1),
-                         [wr2] "w"(wr2),
-                         [vzero] "w"(vzero),
-                         [bias_val] "r"(bias_val),
-                         [out1] "r"(out_buf1),
-                         [out2] "r"(out_buf2)
-                       : "cc",
-                         "memory",
-                         "q4",
-                         "q5",
-                         "q6",
-                         "q7",
-                         "q8",
-                         "q9",
-                         "q10",
-                         "q11",
-                         "q12",
-                         "q13",
-                         "q14",
-                         "q15");
-        }
 #endif
        for (int w = 0; w < w_out; ++w) {
          *doutr0++ = out_buf1[w];

--- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc
@@ -20,10 +20,11 @@ namespace paddle {
 namespace lite {
 namespace arm {
 namespace math {
-void conv_depthwise_3x3s2p0_bias(float* dout,
+void conv_depthwise_3x3s2p0_bias_relu6(float* dout,
                                       const float* din,
                                       const float* weights,
                                       const float* bias,
+                                       const float* six,
                                       bool flag_bias,
                                       const int num,
                                       const int ch_in,
@@ -31,13 +32,13 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
                                       const int w_in,
                                       const int h_out,
                                       const int w_out,
-                                 const operators::ActivationParam act_param,
                                       ARMContext* ctx);

-void conv_depthwise_3x3s2p0_bias_s(float* dout,
+void conv_depthwise_3x3s2p0_bias_leakyRelu(float* dout,
                                           const float* din,
                                           const float* weights,
                                           const float* bias,
+                                           const float* scale,
                                           bool flag_bias,
                                           const int num,
                                           const int ch_in,
@@ -45,13 +46,13 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
                                           const int w_in,
                                           const int h_out,
                                           const int w_out,
-                                   const operators::ActivationParam act_param,
                                           ARMContext* ctx);

-void conv_depthwise_3x3s2p1_bias(float* dout,
+void conv_depthwise_3x3s2p0_bias_s_relu6(float* dout,
                                         const float* din,
                                         const float* weights,
                                         const float* bias,
+                                         const float* six,
                                         bool flag_bias,
                                         const int num,
                                         const int ch_in,
@@ -59,13 +60,69 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
                                         const int w_in,
                                         const int h_out,
                                         const int w_out,
-                                 const operators::ActivationParam act_param,
                                         ARMContext* ctx);

-void conv_depthwise_3x3s2p1_bias_s(float* dout,
+void conv_depthwise_3x3s2p0_bias_s_leakyRelu(float* dout,  // forward declaration; conv_depthwise_3x3s2_fp32 calls this for kLeakyRelu when pad == 0 and w_in <= 8
                                              const float* din,
                                              const float* weights,
                                              const float* bias,
+                                             const float* scale,  // caller passes vscale: act_param.Leaky_relu_alpha replicated 4 times
+                                             bool flag_bias,
+                                             const int num,
+                                             const int ch_in,
+                                             const int h_in,
+                                             const int w_in,
+                                             const int h_out,
+                                             const int w_out,
+                                             ARMContext* ctx);
+
+void conv_depthwise_3x3s2p1_bias_relu6(float* dout,  // forward declaration; conv_depthwise_3x3s2_fp32 calls this for kRelu6 when pad == 1 and w_in > 7
+                                       const float* din,
+                                       const float* weights,
+                                       const float* bias,
+                                       const float* six,  // caller passes vsix: act_param.Relu_clipped_coef replicated 4 times
+                                       bool flag_bias,
+                                       const int num,
+                                       const int ch_in,
+                                       const int h_in,
+                                       const int w_in,
+                                       const int h_out,
+                                       const int w_out,
+                                       ARMContext* ctx);
+
+void conv_depthwise_3x3s2p1_bias_leakyRelu(float* dout,  // forward declaration; conv_depthwise_3x3s2_fp32 calls this for kLeakyRelu when pad == 1 and w_in > 7
+                                           const float* din,
+                                           const float* weights,
+                                           const float* bias,
+                                           const float* scale,  // NOTE(review): presumably vscale (Leaky_relu_alpha x4) as in the pad == 0 calls - confirm at the call site
+                                           bool flag_bias,
+                                           const int num,
+                                           const int ch_in,
+                                           const int h_in,
+                                           const int w_in,
+                                           const int h_out,
+                                           const int w_out,
+                                           ARMContext* ctx);
+
+void conv_depthwise_3x3s2p1_bias_s_relu6(float* dout,  // forward declaration; conv_depthwise_3x3s2_fp32 calls this for kRelu6 when pad == 1 and w_in <= 7
+                                         const float* din,
+                                         const float* weights,
+                                         const float* bias,
+                                         const float* six,  // caller passes vsix: act_param.Relu_clipped_coef replicated 4 times
+                                         bool flag_bias,
+                                         const int num,
+                                         const int ch_in,
+                                         const int h_in,
+                                         const int w_in,
+                                         const int h_out,
+                                         const int w_out,
+                                         ARMContext* ctx);
+
+void conv_depthwise_3x3s2p1_bias_s_leakyRelu(float* dout,
+                                             const float* din,
+                                             const float* weights,
+                                             const float* bias,
+                                             const float* scale,
                                             bool flag_bias,
                                             const int num,
                                             const int ch_in,
@@ -73,7 +130,6 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
                                             const int w_in,
                                             const int h_out,
                                             const int w_out,
-                                   const operators::ActivationParam act_param,
                                             ARMContext* ctx);

 void conv_depthwise_3x3s2_fp32(const float* din,
@@ -92,22 +148,85 @@ void conv_depthwise_3x3s2_fp32(const float* din,
                               const operators::ActivationParam act_param,
                               ARMContext* ctx) {
  bool has_active = act_param.has_active;
-  bool flag_relu = false;
-  bool relu6 = false;
+  auto act_type = act_param.active_type;
+  float tmp = act_param.Relu_clipped_coef;
+  float ss = act_param.Leaky_relu_alpha;
+  float vsix[4] = {tmp, tmp, tmp, tmp};
+  float vscale[4] = {ss, ss, ss, ss};
  if (has_active) {
-    if (act_param.active_type == lite_api::ActivationType::kRelu) {
-      flag_relu = true;
+    switch (act_type) {
+      case lite_api::ActivationType::kRelu:
+        if (pad == 0) {
+          if (w_in > 8) {
+            conv_depthwise_3x3s2p0_bias_relu(dout,
+                                             din,
+                                             weights,
+                                             bias,
+                                             flag_bias,
+                                             true,
+                                             num,
+                                             ch_in,
+                                             h_in,
+                                             w_in,
+                                             h_out,
+                                             w_out,
+                                             ctx);
+          } else {
+            conv_depthwise_3x3s2p0_bias_s_relu(dout,
+                                               din,
+                                               weights,
+                                               bias,
+                                               flag_bias,
+                                               true,
+                                               num,
+                                               ch_in,
+                                               h_in,
+                                               w_in,
+                                               h_out,
+                                               w_out,
+                                               ctx);
+          }
+        }
+        if (pad == 1) {
+          if (w_in > 7) {
+            conv_depthwise_3x3s2p1_bias_relu(dout,
+                                             din,
+                                             weights,
+                                             bias,
+                                             flag_bias,
+                                             true,
+                                             num,
+                                             ch_in,
+                                             h_in,
+                                             w_in,
+                                             h_out,
+                                             w_out,
+                                             ctx);
          } else {
-      relu6 = true;
+            conv_depthwise_3x3s2p1_bias_s_relu(dout,
+                                               din,
+                                               weights,
+                                               bias,
+                                               flag_bias,
+                                               true,
+                                               num,
+                                               ch_in,
+                                               h_in,
+                                               w_in,
+                                               h_out,
+                                               w_out,
+                                               ctx);
          }
        }
+        break;
+      case lite_api::ActivationType::kRelu6:
        if (pad == 0) {
          if (w_in > 8) {
-      if (relu6) {
-        conv_depthwise_3x3s2p0_bias(dout,
+            conv_depthwise_3x3s2p0_bias_relu6(dout,
                                              din,
                                              weights,
                                              bias,
+                                              vsix,
                                              flag_bias,
                                              num,
                                              ch_in,
@@ -115,15 +234,14 @@ void conv_depthwise_3x3s2_fp32(const float* din,
                                              w_in,
                                              h_out,
                                              w_out,
-                                    act_param,
                                              ctx);
          } else {
-        conv_depthwise_3x3s2p0_bias_relu(dout,
+            conv_depthwise_3x3s2p0_bias_s_relu6(dout,
                                                din,
                                                weights,
                                                bias,
+                                                vsix,
                                                flag_bias,
-                                         flag_relu,
                                                num,
                                                ch_in,
                                                h_in,
@@ -132,12 +250,14 @@ void conv_depthwise_3x3s2_fp32(const float* din,
                                                w_out,
                                                ctx);
          }
-    } else {
-      if (relu6) {
-        conv_depthwise_3x3s2p0_bias_s(dout,
+        }
+        if (pad == 1) {
+          if (w_in > 7) {
+            conv_depthwise_3x3s2p1_bias_relu6(dout,
                                              din,
                                              weights,
                                              bias,
+                                              vsix,
                                              flag_bias,
                                              num,
                                              ch_in,
@@ -145,15 +265,14 @@ void conv_depthwise_3x3s2_fp32(const float* din,
                                              w_in,
                                              h_out,
                                              w_out,
-                                      act_param,
                                              ctx);
          } else {
-        conv_depthwise_3x3s2p0_bias_s_relu(dout,
+            conv_depthwise_3x3s2p1_bias_s_relu6(dout,
                                                din,
                                                weights,
                                                bias,
+                                                vsix,
                                                flag_bias,
-                                           flag_relu,
                                                num,
                                                ch_in,
                                                h_in,
@@ -163,14 +282,46 @@ void conv_depthwise_3x3s2_fp32(const float* din,
                                                ctx);
          }
        }
+        break;
+      case lite_api::ActivationType::kLeakyRelu:
+        if (pad == 0) {
+          if (w_in > 8) {
+            conv_depthwise_3x3s2p0_bias_leakyRelu(dout,
+                                                  din,
+                                                  weights,
+                                                  bias,
+                                                  vscale,
+                                                  flag_bias,
+                                                  num,
+                                                  ch_in,
+                                                  h_in,
+                                                  w_in,
+                                                  h_out,
+                                                  w_out,
+                                                  ctx);
+          } else {
+            conv_depthwise_3x3s2p0_bias_s_leakyRelu(dout,
+                                                    din,
+                                                    weights,
+                                                    bias,
+                                                    vscale,
+                                                    flag_bias,
+                                                    num,
+                                                    ch_in,
+                                                    h_in,
+                                                    w_in,
+                                                    h_out,
+                                                    w_out,
+                                                    ctx);
+          }
        }
        if (pad == 1) {
          if (w_in > 7) {
-      if (relu6) {
-        conv_depthwise_3x3s2p1_bias(dout,
+            conv_depthwise_3x3s2p1_bias_leakyRelu(dout,
                                                  din,
                                                  weights,
                                                  bias,
+                                                  vscale,
                                                  flag_bias,
                                                  num,
                                                  ch_in,
@@ -178,15 +329,14 @@ void conv_depthwise_3x3s2_fp32(const float* din,
                                                  w_in,
                                                  h_out,
                                                  w_out,
-                                    act_param,
                                                  ctx);
          } else {
-        conv_depthwise_3x3s2p1_bias_relu(dout,
+            conv_depthwise_3x3s2p1_bias_s_leakyRelu(dout,
                                                    din,
                                                    weights,
                                                    bias,
+                                                    vscale,
                                                    flag_bias,
-                                         flag_relu,
                                                    num,
                                                    ch_in,
                                                    h_in,
@@ -195,28 +345,66 @@ void conv_depthwise_3x3s2_fp32(const float* din,
                                                    w_out,
                                                    ctx);
          }
+        }
+        break;
+      default:
+        LOG(FATAL) << "this act_type: " << static_cast<int>(act_type)
+                   << " fuse not support";
+    }
  } else {
-      if (relu6) {
-        conv_depthwise_3x3s2p1_bias_s(dout,
+    if (pad == 0) {
+      if (w_in > 8) {
+        conv_depthwise_3x3s2p0_bias_no_relu(dout,
                                            din,
                                            weights,
                                            bias,
                                            flag_bias,
+                                            false,
                                            num,
                                            ch_in,
                                            h_in,
                                            w_in,
                                            h_out,
                                            w_out,
-                                      act_param,
                                            ctx);
      } else {
-        conv_depthwise_3x3s2p1_bias_s_relu(dout,
+        conv_depthwise_3x3s2p0_bias_s_no_relu(dout,
+                                              din,
+                                              weights,
+                                              bias,
+                                              flag_bias,
+                                              false,
+                                              num,
+                                              ch_in,
+                                              h_in,
+                                              w_in,
+                                              h_out,
+                                              w_out,
+                                              ctx);
+      }
+    }
+    if (pad == 1) {
+      if (w_in > 7) {
+        conv_depthwise_3x3s2p1_bias_no_relu(dout,
+                                            din,
+                                            weights,
+                                            bias,
+                                            flag_bias,
+                                            false,
+                                            num,
+                                            ch_in,
+                                            h_in,
+                                            w_in,
+                                            h_out,
+                                            w_out,
+                                            ctx);
+      } else {
+        conv_depthwise_3x3s2p1_bias_s_no_relu(dout,
                                              din,
                                              weights,
                                              bias,
                                              flag_bias,
-                                           flag_relu,
+                                              false,
                                              num,
                                              ch_in,
                                              h_in,
@@ -228,6 +416,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
    }
  }
 }
+// clang-format off
 #ifdef __aarch64__
 #define INIT_S2                                  \
  "prfm pldl1keep, [%[inptr0]]             \n"   \
@@ -746,6 +935,18 @@ void conv_depthwise_3x3s2_fp32(const float* din,
  "fmax v4.4s, v4.4s, v9.4s                       \n" \
                                                      \
  "st1 {v4.4s}, [%[out]]                          \n"
+#define RESULT_S_S2_RELU6                             \
+  "fadd v4.4s, v4.4s, %[bias].4s                  \n" \
+  "fmax v4.4s, v4.4s, v9.4s                       \n" \
+  "fmin v4.4s, v4.4s, %[vsix].4s                  \n" \
+                                                      \
+  "st1 {v4.4s}, [%[out]]                          \n"
+#define RESULT_S_S2_LEAKY_RELU                        \
+  "fadd v4.4s, v4.4s, %[bias].4s                  \n" \
+  "fcmge v11.4s, v4.4s, %[vzero].4s \n"/* vcgeq_f32 */\
+  "fmul v12.4s, v4.4s, %[vscale].4s                \n"\
+  "bif  v4.16b, v12.16b, v11.16b \n" /* choose*/      \
+  "st1 {v4.4s}, [%[out]]                          \n"
 #define COMPUTE_S_S2_P0                                \
  "movi v9.4s, #0                                 \n"  \
  "ld1  {v6.4s, v7.4s}, [%[mask_ptr]], #32        \n"  \
@@ -785,6 +986,15 @@ void conv_depthwise_3x3s2_fp32(const float* din,
 #define RESULT_S_S2_P0_RELU                           \
  "fmax v4.4s, v4.4s, v9.4s                       \n" \
  "st1 {v4.4s}, [%[out]]                          \n"
+#define RESULT_S_S2_P0_RELU6                          \
+  "fmax v4.4s, v4.4s, v9.4s                       \n" \
+  "fmin v4.4s, v4.4s, %[vsix].4s                  \n" \
+  "st1 {v4.4s}, [%[out]]                          \n"
+#define RESULT_S_S2_P0_LEAKY_RELU                      \
+  "fcmge v11.4s, v4.4s, %[vzero].4s \n"/* vcgeq_f32 */\
+  "fmul v12.4s, v4.4s, %[vscale].4s                \n"\
+  "bif  v4.16b, v12.16b, v11.16b \n" /* choose*/      \
+  "st1 {v4.4s}, [%[out]]                          \n"

 #else
 #define INIT_S2                                                     \
@@ -822,12 +1032,11 @@ void conv_depthwise_3x3s2_fp32(const float* din,
  "vmla.f32 q5, q15, %f[wr2][0]                   @ mul weight 1, out1\n" \
  "vmla.f32 q3,  q8, %e[wr2][0]                   @ mul weight 1, out1\n" \
                                                                          \
-  "vld2.32  {d28-d31}, [%[din2_ptr]]!             @ load din r1\n"        \
-                                                                          \
-  "vadd.f32 q3, q3, q4                            @ add \n"               \
-  "vadd.f32 q3, q3, q5                            @ add \n"
+  "vld2.32  {d28-d31}, [%[din2_ptr]]!             @ load din r1\n"

 #define LEFT_RESULT_S2                                     \
+  "vadd.f32 q3, q3, q4                            @ add \n"\
+  "vadd.f32 q3, q3, q5                            @ add \n"\
  "vst1.32 {d6-d7}, [%[outptr]]!                  \n"      \
  "cmp %[cnt], #1                                 \n"      \
  "blt 1f                                         \n"
@@ -860,12 +1069,11 @@ void conv_depthwise_3x3s2_fp32(const float* din,
  "vmla.f32 q5, q15, %e[wr2][1]                   @ mul weight 2, out0\n" \
  "vmla.f32 q3,  q6, %f[wr2][0]                   @ mul weight 2, out0\n" \
                                                                          \
-  "vld2.32  {d28-d31}, [%[din2_ptr]]!             @ load din r2\n"        \
-                                                                          \
-  "vadd.f32 q3, q3, q4                            @ add \n"               \
-  "vadd.f32 q3, q3, q5                            @ add \n"
+  "vld2.32  {d28-d31}, [%[din2_ptr]]!             @ load din r2\n"

 #define MID_RESULT_S2                                 \
+  "vadd.f32 q3, q3, q4                       @ add \n"\
+  "vadd.f32 q3, q3, q5                       @ add \n"\
  "subs %[cnt], #1                                \n" \
                                                      \
  "vst1.32 {d6-d7}, [%[outptr]]!                  \n" \
@@ -910,36 +1118,104 @@ void conv_depthwise_3x3s2_fp32(const float* din,
                                                                            \
  "vmla.f32 q4, q14, %e[wr2][0]                   @ mul weight 2, out0\n"   \
  "vmla.f32 q5, q15, %e[wr2][1]                   @ mul weight 2, out0\n"   \
-  "vmla.f32 q3,  q6, %f[wr2][0]                   @ mul weight 2, out0\n"   \
-                                                                            \
-  "vadd.f32 q3, q3, q4                            @ add \n"                 \
-  "vadd.f32 q3, q3, q5                            @ add \n"
+  "vmla.f32 q3,  q6, %f[wr2][0]                   @ mul weight 2, out0\n"

 #define RIGHT_RESULT_S2                                           \
+  "vadd.f32 q3, q3, q4                            @ add \n"       \
+  "vadd.f32 q3, q3, q5                            @ add \n"       \
  "vbif.f32 q3, q10, q11                          @ write mask\n" \
                                                                  \
  "vst1.32 {d6-d7}, [%[outptr]]!                  \n"             \
  "3:                                             \n"

 #define LEFT_RESULT_S2_RELU                                \
-  "vmax.f32 q3, q3, q9                    @ relu \n"  \
-  "vst1.32 {d6-d7}, [%[outptr]]!                  \n" \
-  "cmp %[cnt], #1                                 \n" \
+  "vadd.f32 q3, q3, q4                            @ add \n"\
+  "vadd.f32 q3, q3, q5                            @ add \n"\
+  "vmax.f32 q3, q3, q9                                  \n"\
+  "cmp %[cnt], #1                                       \n"\
+  "vst1.32 {d6-d7}, [%[outptr]]!                        \n"\
+  "blt 1f                                               \n"
+#define LEFT_RESULT_S2_RELU6                            \
+  "vadd.f32 q3, q3, q4                         @ add \n"\
+  "vld1.f32 {d12-d13}, [%[six_ptr]] @ load six       \n"\
+  "vadd.f32 q3, q3, q5                         @ add \n"\
+  "vmax.f32 q3, q3, q9                        @ relu \n"\
+  "cmp %[cnt], #1                                    \n"\
+  "vmin.f32 q3, q3, q6                        @ relu \n"\
+  "vst1.32 {d6-d7}, [%[outptr]]!                     \n"\
+  "blt 1f                                            \n"
+#define LEFT_RESULT_S2_LEAKY_RELU                       \
+  "vadd.f32 q3, q3, q4                               \n"\
+  "vld1.f32 {d12-d13}, [%[scale_ptr]]                \n"\
+  "vadd.f32 q3, q3, q5                               \n"\
+  "vcge.f32 q7, q3, q9                               \n"\
+  "vmul.f32 q8, q3, q6                               \n"\
+  "cmp %[cnt], #1                                    \n"\
+  "vbif q3, q8, q7 @ choose                          \n"\
+  "vst1.32 {d6-d7}, [%[outptr]]!                     \n"\
  "blt 1f                                            \n"
 #define MID_RESULT_S2_RELU                              \
-  "vmax.f32 q3, q3, q9                    @ relu \n"  \
-  "subs %[cnt], #1                                \n" \
+  "vadd.f32 q3, q3, q4                         @ add \n"\
+  "vadd.f32 q3, q3, q5                         @ add \n"\
+  "subs %[cnt], #1                                   \n"\
+  "vmax.f32 q3, q3, q9                        @ relu \n"\
                                                        \
-  "vst1.32 {d6-d7}, [%[outptr]]!                  \n" \
+  "vst1.32 {d6-d7}, [%[outptr]]!                     \n"\
+  "bne  2b                                             \n"
+
+#define MID_RESULT_S2_RELU6                             \
+  "vadd.f32 q3, q3, q4                         @ add \n"\
+  "vld1.f32 {d12-d13}, [%[six_ptr]] @ load six       \n"\
+  "vadd.f32 q3, q3, q5                         @ add \n"\
+  "vmax.f32 q3, q3, q9                        @ relu \n"\
+  "subs %[cnt], #1                                   \n"\
+  "vmin.f32 q3, q3, q6                        @ relu \n"\
+                                                        \
+  "vst1.32 {d6-d7}, [%[outptr]]!                     \n"\
+  "bne  2b                                             \n"
+#define MID_RESULT_S2_LEAKY_RELU                       \
+  "vadd.f32 q3, q3, q4                         @ add \n"\
+  "vld1.f32 {d12-d13}, [%[scale_ptr]]                \n"\
+  "vadd.f32 q3, q3, q5                         @ add \n"\
+  "vcge.f32 q7, q3, q9                               \n"\
+  "vmul.f32 q8, q3, q6                               \n"\
+  "subs %[cnt], #1                                   \n"\
+  "vbif q3, q8, q7 @ choose                          \n"\
+                                                        \
+  "vst1.32 {d6-d7}, [%[outptr]]!                     \n"\
  "bne  2b                                             \n"

 #define RIGHT_RESULT_S2_RELU                            \
-  "vmax.f32 q3, q3, q9                    @ relu \n"              \
-  "vbif.f32 q3, q10, q11                          @ write mask\n" \
+  "vadd.f32 q3, q3, q4                         @ add \n"\
+  "vadd.f32 q3, q3, q5                         @ add \n"\
+  "vmax.f32 q3, q3, q9                         @ relu\n"\
+  "vbif.f32 q3, q10, q11                 @ write mask\n"\
                                                        \
-  "vst1.32 {d6-d7}, [%[outptr]]!                  \n"             \
+  "vst1.32 {d6-d7}, [%[outptr]]!                     \n"\
  "3:                                                 \n"

+#define RIGHT_RESULT_S2_RELU6                           \
+  "vadd.f32 q3, q3, q4                         @ add \n"\
+  "vld1.f32 {d12-d13}, [%[six_ptr]] @ load six       \n"\
+  "vadd.f32 q3, q3, q5                         @ add \n"\
+  "vmax.f32 q3, q3, q9                         @ relu\n"\
+  "vmin.f32 q3, q3, q6                        @ relu \n"\
+                                                        \
+  "vbif.f32 q3, q10, q11                 @ write mask\n"\
+                                                        \
+  "vst1.32 {d6-d7}, [%[outptr]]!                     \n"\
+  "3:                                                 \n"
+#define RIGHT_RESULT_S2_LEAKY_RELU                      \
+  "vadd.f32 q3, q3, q4                         @ add \n"\
+  "vld1.f32 {d12-d13}, [%[scale_ptr]]                \n"\
+  "vadd.f32 q3, q3, q5                         @ add \n"\
+  "vcge.f32 q7, q3, q9                               \n"\
+  "vmul.f32 q8, q3, q6                               \n"\
+  "vbif q3, q8, q7 @ choose                          \n"\
+  "vbif.f32 q3, q10, q11                 @ write mask\n"\
+                                                        \
+  "vst1.32 {d6-d7}, [%[outptr]]!                     \n"\
+  "3:                                                 \n"
 #define COMPUTE_S_S2                                                        \
  "vmov.u32 q9, #0                                \n"                       \
  "vld1.f32   {d12-d15}, [%[mask_ptr]]!           @ load mask\n"            \
@@ -976,17 +1252,36 @@ void conv_depthwise_3x3s2_fp32(const float* din,
                                                                            \
  "vmla.f32 q4, q14, %e[wr2][1]                   @ mul weight 2, out0\n"   \
  "vmla.f32 q5, q15, %f[wr2][0]                   @ mul weight 2, out0\n"   \
-  "vmla.f32 q3, q8,  %e[wr2][0]                   @ mul weight 2, out0\n"   \
-                                                                            \
-  "vadd.f32 q3, q3, q4                            @ add \n"                 \
-  "vadd.f32 q3, q3, q5                            @ add \n"
+  "vmla.f32 q3, q8,  %e[wr2][0]                   @ mul weight 2, out0\n"

-#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]]                            \n"
+#define RESULT_S_S2                                        \
+  "vadd.f32 q3, q3, q4                            @ add \n"\
+  "vadd.f32 q3, q3, q5                            @ add \n"\
+  "vst1.32 {d6-d7}, [%[out]]                            \n"
 #define RESULT_S_S2_RELU                                   \
-  "vmax.f32 q3, q3, q9                            @ relu\n" \
+  "vadd.f32 q3, q3, q4                            @ add \n"\
+  "vadd.f32 q3, q3, q5                            @ add \n"\
+  "vmax.f32 q3, q3, q9                            @ relu\n"\
                                                           \
  "vst1.32 {d6-d7}, [%[out]]                            \n"

+#define RESULT_S_S2_RELU6                                  \
+  "vadd.f32 q3, q3, q4                            @ add \n"\
+  "vld1.f32 {d12-d13}, [%[six_ptr]] @ load six          \n"\
+  "vadd.f32 q3, q3, q5                            @ add \n"\
+  "vmax.f32 q3, q3, q9                            @ relu\n"\
+  "vmin.f32 q3, q3, q6                            @ relu\n"\
+                                                           \
+  "vst1.32 {d6-d7}, [%[out]]                            \n"
+#define RESULT_S_S2_LEAKY_RELU                            \
+  "vadd.f32 q3, q3, q4                            @ add \n"\
+  "vld1.f32 {d12-d13}, [%[scale_ptr]]                   \n"\
+  "vadd.f32 q3, q3, q5                            @ add \n"\
+  "vcge.f32 q7, q3, q9                                  \n"\
+  "vmul.f32 q8, q3, q6                                  \n"\
+  "vbif q3, q8, q7 @ choose                             \n"\
+                                                           \
+  "vst1.32 {d6-d7}, [%[out]]                            \n"
 #define COMPUTE_S_S2_P0                                                     \
  "vmov.u32 q9, #0                                \n"                       \
  "vld1.f32   {d12-d15}, [%[mask_ptr]]           @ load mask\n"             \
@@ -1023,198 +1318,44 @@ void conv_depthwise_3x3s2_fp32(const float* din,
                                                                            \
  "vmla.f32 q4, q14, %e[wr2][0]                   @ mul weight 2, out0\n"   \
  "vmla.f32 q5, q15, %e[wr2][1]                   @ mul weight 2, out0\n"   \
-  "vmla.f32 q3, q8,  %f[wr2][0]                   @ mul weight 2, out0\n"   \
-                                                                            \
-  "vadd.f32 q3, q3, q4                            @ add \n"                 \
-  "vadd.f32 q3, q3, q5                            @ add \n"
+  "vmla.f32 q3, q8,  %f[wr2][0]                   @ mul weight 2, out0\n"

-#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]]                            \n"
+#define RESULT_S_S2_P0                                      \
+  "vadd.f32 q3, q3, q4                            @ add \n" \
+  "vadd.f32 q3, q3, q5                            @ add \n" \
+  "vst1.32 {d6-d7}, [%[out]]                            \n"
 #define RESULT_S_S2_P0_RELU                                 \
+  "vadd.f32 q3, q3, q4                            @ add \n" \
+  "vadd.f32 q3, q3, q5                            @ add \n" \
  "vmax.f32 q3, q3, q9                            @ relu \n" \
  "vst1.32 {d6-d7}, [%[out]]                             \n"
+#define RESULT_S_S2_P0_RELU6                                \
+  "vadd.f32 q3, q3, q4                            @ add \n" \
+  "vld1.f32 {d12-d13}, [%[six_ptr]] @ load six          \n" \
+  "vadd.f32 q3, q3, q5                            @ add \n" \
+  "vmax.f32 q3, q3, q9                            @ relu\n" \
+  "vmin.f32 q3, q3, q6                            @ relu\n" \
+  "vst1.32 {d6-d7}, [%[out]]                             \n"
+#define RESULT_S_S2_P0_LEAKY_RELU                           \
+  "vadd.f32 q3, q3, q4                            @ add \n" \
+  "vld1.f32 {d12-d13}, [%[scale_ptr]] @ load scale      \n" \
+  "vadd.f32 q3, q3, q5                            @ add \n" \
+  "vcge.f32 q7, q3, q9                                  \n" \
+  "vmul.f32 q8, q3, q6                                  \n" \
+  "vbif q3, q8, q7 @ choose                             \n" \
+  "vst1.32 {d6-d7}, [%[out]]                            \n"
 #endif
-#ifdef __aarch64__
-void act_switch_3x3s2p1(const float* din0_ptr,
-                        const float* din1_ptr,
-                        const float* din2_ptr,
-                        const float* din3_ptr,
-                        const float* din4_ptr,
-                        float* doutr0_ptr,
-                        float* doutr1_ptr,
-                        float32x4_t wr0,
-                        float32x4_t wr1,
-                        float32x4_t wr2,
-                        uint32x4_t vmask_rp1,
-                        uint32x4_t vmask_rp2,
-                        uint32x4_t wmask,
-                        float32x4_t wbias,
-                        float32x4_t vzero,
-                        int cnt,
-                        int cnt_remain,
-                        const operators::ActivationParam act_param) {
-  float tmp = act_param.Relu_clipped_coef;
-  float ss = act_param.Leaky_relu_alpha;
-  float vsix[4] = {tmp, tmp, tmp, tmp};
-  float vscale[4] = {ss, ss, ss, ss};
+// clang-format on

-  switch (act_param.active_type) {
-    case lite_api::ActivationType::kRelu:
-      asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2
-                       MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU
-                   : [inptr0] "+r"(din0_ptr),
-                     [inptr1] "+r"(din1_ptr),
-                     [inptr2] "+r"(din2_ptr),
-                     [inptr3] "+r"(din3_ptr),
-                     [inptr4] "+r"(din4_ptr),
-                     [outptr0] "+r"(doutr0_ptr),
-                     [outptr1] "+r"(doutr1_ptr),
-                     [cnt] "+r"(cnt)
-                   : [vzero] "w"(vzero),
-                     [w0] "w"(wr0),
-                     [w1] "w"(wr1),
-                     [w2] "w"(wr2),
-                     [remain] "r"(cnt_remain),
-                     [mask1] "w"(vmask_rp1),
-                     [mask2] "w"(vmask_rp2),
-                     [wmask] "w"(wmask),
-                     [vbias] "w"(wbias)
-                   : "cc",
-                     "memory",
-                     "v0",
-                     "v1",
-                     "v2",
-                     "v3",
-                     "v4",
-                     "v5",
-                     "v6",
-                     "v7",
-                     "v8",
-                     "v9",
-                     "v10",
-                     "v11",
-                     "v12",
-                     "v13",
-                     "v14",
-                     "v15",
-                     "v16",
-                     "v17",
-                     "v18",
-                     "v19",
-                     "v20",
-                     "v21");
-      break;
-    case lite_api::ActivationType::kRelu6:
-      /* 0 <= din <= 6 */
-      asm volatile(
-          INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2
-              MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6
-          : [inptr0] "+r"(din0_ptr),
-            [inptr1] "+r"(din1_ptr),
-            [inptr2] "+r"(din2_ptr),
-            [inptr3] "+r"(din3_ptr),
-            [inptr4] "+r"(din4_ptr),
-            [outptr0] "+r"(doutr0_ptr),
-            [outptr1] "+r"(doutr1_ptr),
-            [cnt] "+r"(cnt)
-          : [vzero] "w"(vzero),
-            [w0] "w"(wr0),
-            [w1] "w"(wr1),
-            [w2] "w"(wr2),
-            [remain] "r"(cnt_remain),
-            [six_ptr] "r"(vsix),
-            [mask1] "w"(vmask_rp1),
-            [mask2] "w"(vmask_rp2),
-            [wmask] "w"(wmask),
-            [vbias] "w"(wbias)
-          : "cc",
-            "memory",
-            "v0",
-            "v1",
-            "v2",
-            "v3",
-            "v4",
-            "v5",
-            "v6",
-            "v7",
-            "v8",
-            "v9",
-            "v10",
-            "v11",
-            "v12",
-            "v13",
-            "v14",
-            "v15",
-            "v16",
-            "v17",
-            "v18",
-            "v19",
-            "v20",
-            "v21",
-            "v22");
-      break;
-    case lite_api::ActivationType::kLeakyRelu:
-      /*din = din >= 0 ? din : din * scale*/
-      asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU
-                       MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU RIGHT_COMPUTE_S2
-                           RIGHT_RESULT_S2_LEAKY_RELU
-                   : [inptr0] "+r"(din0_ptr),
-                     [inptr1] "+r"(din1_ptr),
-                     [inptr2] "+r"(din2_ptr),
-                     [inptr3] "+r"(din3_ptr),
-                     [inptr4] "+r"(din4_ptr),
-                     [outptr0] "+r"(doutr0_ptr),
-                     [outptr1] "+r"(doutr1_ptr),
-                     [cnt] "+r"(cnt)
-                   : [vzero] "w"(vzero),
-                     [w0] "w"(wr0),
-                     [w1] "w"(wr1),
-                     [w2] "w"(wr2),
-                     [remain] "r"(cnt_remain),
-                     [scale_ptr] "r"(vscale),
-                     [mask1] "w"(vmask_rp1),
-                     [mask2] "w"(vmask_rp2),
-                     [wmask] "w"(wmask),
-                     [vbias] "w"(wbias)
-                   : "cc",
-                     "memory",
-                     "v0",
-                     "v1",
-                     "v2",
-                     "v3",
-                     "v4",
-                     "v5",
-                     "v6",
-                     "v7",
-                     "v8",
-                     "v9",
-                     "v10",
-                     "v11",
-                     "v12",
-                     "v13",
-                     "v14",
-                     "v15",
-                     "v16",
-                     "v17",
-                     "v18",
-                     "v19",
-                     "v20",
-                     "v21",
-                     "v22");
-      break;
-    default:
-      LOG(FATAL) << "this act_type: " << static_cast<int>(act_param.active_type)
-                 << " fuse not support";
-  }
-}
-#endif
 /**
 * \brief depthwise convolution kernel 3x3, stride 2
 * w_in > 7
 */
-void conv_depthwise_3x3s2p1_bias(float* dout,
+void conv_depthwise_3x3s2p1_bias_relu6(float* dout,
                                       const float* din,
                                       const float* weights,
                                       const float* bias,
+                                       const float* six,
                                       bool flag_bias,
                                       const int num,
                                       const int ch_in,
@@ -1222,7 +1363,6 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
                                       const int w_in,
                                       const int h_out,
                                       const int w_out,
-                                 const operators::ActivationParam act_param,
                                       ARMContext* ctx) {
  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
  int out_pad_idx[4] = {0, 1, 2, 3};
@@ -1350,24 +1490,52 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
          doutr1_ptr = write_ptr;
        }
        int cnt = cnt_col;
-        act_switch_3x3s2p1(din0_ptr,
-                           din1_ptr,
-                           din2_ptr,
-                           din3_ptr,
-                           din4_ptr,
-                           doutr0_ptr,
-                           doutr1_ptr,
-                           wr0,
-                           wr1,
-                           wr2,
-                           vmask_rp1,
-                           vmask_rp2,
-                           wmask,
-                           wbias,
-                           vzero,
-                           cnt,
-                           cnt_remain,
-                           act_param);
+        asm volatile(
+            INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2
+                MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6
+            : [inptr0] "+r"(din0_ptr),
+              [inptr1] "+r"(din1_ptr),
+              [inptr2] "+r"(din2_ptr),
+              [inptr3] "+r"(din3_ptr),
+              [inptr4] "+r"(din4_ptr),
+              [outptr0] "+r"(doutr0_ptr),
+              [outptr1] "+r"(doutr1_ptr),
+              [cnt] "+r"(cnt)
+            : [vzero] "w"(vzero),
+              [w0] "w"(wr0),
+              [w1] "w"(wr1),
+              [w2] "w"(wr2),
+              [remain] "r"(cnt_remain),
+              [six_ptr] "r"(six),
+              [mask1] "w"(vmask_rp1),
+              [mask2] "w"(vmask_rp2),
+              [wmask] "w"(wmask),
+              [vbias] "w"(wbias)
+            : "cc",
+              "memory",
+              "v0",
+              "v1",
+              "v2",
+              "v3",
+              "v4",
+              "v5",
+              "v6",
+              "v7",
+              "v8",
+              "v9",
+              "v10",
+              "v11",
+              "v12",
+              "v13",
+              "v14",
+              "v15",
+              "v16",
+              "v17",
+              "v18",
+              "v19",
+              "v20",
+              "v21",
+              "v22");
        doutr0 = doutr0 + 2 * w_out;
      }
 #else
@@ -1404,8 +1572,9 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
        }
        int cnt = cnt_col;
        unsigned int* mask_ptr = dmask;
-        asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
-                         MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
+        asm volatile(
+            INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2
+                MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6
            : [din0_ptr] "+r"(din0_ptr),
              [din1_ptr] "+r"(din1_ptr),
              [din2_ptr] "+r"(din2_ptr),
@@ -1416,6 +1585,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
              [wr0] "w"(wr0),
              [wr1] "w"(wr1),
              [wr2] "w"(wr2),
+              [six_ptr] "r"(six),
              [bias] "r"(bias_c)
            : "cc",
              "memory",
@@ -1432,24 +1602,17 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
              "q13",
              "q14",
              "q15");
-        // do act
-        if (act_param.has_active) {
-          act_switch_process(doutr0, doutr0, w_out, &act_param);
-        }
        doutr0 = doutr0 + w_out;
      }
 #endif
    }
  }
 }
-
-/**
- * \brief depthwise convolution kernel 3x3, stride 2, width <= 4
- */
-void conv_depthwise_3x3s2p1_bias_s(float* dout,
+void conv_depthwise_3x3s2p1_bias_leakyRelu(float* dout,
                                           const float* din,
                                           const float* weights,
                                           const float* bias,
+                                           const float* scale,
                                           bool flag_bias,
                                           const int num,
                                           const int ch_in,
@@ -1457,23 +1620,41 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
                                           const int w_in,
                                           const int h_out,
                                           const int w_out,
-                                   const operators::ActivationParam act_param,
                                           ARMContext* ctx) {
  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
  int out_pad_idx[4] = {0, 1, 2, 3};
-  float zeros[8] = {0.0f};
+  int size_pad_bottom = h_out * 2 - h_in;

-  uint32x4_t vmask_rp1 =
-      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx));  // 0 2 4 6
-  uint32x4_t vmask_rp2 =
-      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
+  int tile_w = w_out >> 2;
+  int cnt_remain = w_out % 4;
+  unsigned int size_right_remain = (unsigned int)(7 + (tile_w << 3) - w_in);
+  size_right_remain = 8 - size_right_remain;

+  if (cnt_remain == 0 && size_right_remain == 0) {
+    cnt_remain = 4;
+    tile_w -= 1;
+    size_right_remain = 8;
+  }
+  int cnt_col = tile_w - 1;
+
+  uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),
+                                   vld1q_s32(right_pad_idx));  // 0 2 4 6
+  uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain),
+                                   vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
+  uint32x4_t wmask =
+      vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));  // 0 1 2 3
  int size_in_channel = w_in * h_in;
  int size_out_channel = w_out * h_out;

-  unsigned int dmask[8];
+  float* zero_ptr = ctx->workspace_data<float>();
+  memset(zero_ptr, 0, w_in * sizeof(float));
+  float* write_ptr = zero_ptr + w_in;
+
+  unsigned int dmask[12];
+
  vst1q_u32(dmask, vmask_rp1);
  vst1q_u32(dmask + 4, vmask_rp2);
+  vst1q_u32(dmask + 8, wmask);

  for (int n = 0; n < num; ++n) {
    const float* din_batch = din + n * ch_in * size_in_channel;
@@ -1488,32 +1669,406 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);

+      float32x4_t vzero = vdupq_n_f32(0.f);
+#ifdef __aarch64__
+      float32x4_t wbias;
+      if (flag_bias) {
+        wbias = vdupq_n_f32(bias[i]);
+      } else {
+        wbias = vdupq_n_f32(0.f);
+      }
+#else
      float bias_c = 0.f;
-
      if (flag_bias) {
        bias_c = bias[i];
      }
-      float32x4_t vbias = vdupq_n_f32(bias_c);
-      int hs = -1;
-      int he = 2;
-      float out_buf[4];
-      for (int j = 0; j < h_out; ++j) {
-        const float* dr0 = din_channel + hs * w_in;
+#endif  // __aarch64__
+
+      const float* dr0 = din_channel;
      const float* dr1 = dr0 + w_in;
      const float* dr2 = dr1 + w_in;
-        if (hs == -1) {
-          dr0 = zeros;
-        }
-        if (he > h_in) {
-          dr2 = zeros;
-        }
+      const float* dr3 = dr2 + w_in;
+      const float* dr4 = dr3 + w_in;
+
      const float* din0_ptr = dr0;
      const float* din1_ptr = dr1;
      const float* din2_ptr = dr2;
+      const float* din3_ptr = dr3;
+      const float* din4_ptr = dr4;
+
+      float* doutr0 = dout_channel;
+      float* doutr0_ptr = nullptr;
+      float* doutr1_ptr = nullptr;

-        unsigned int* mask_ptr = dmask;
 #ifdef __aarch64__
-        asm volatile(COMPUTE_S_S2 RESULT_S_S2
+      for (int i = 0; i < h_out; i += 2) {
+        din0_ptr = dr0;
+        din1_ptr = dr1;
+        din2_ptr = dr2;
+        din3_ptr = dr3;
+        din4_ptr = dr4;
+
+        doutr0_ptr = doutr0;
+        doutr1_ptr = doutr0 + w_out;
+
+        if (i == 0) {
+          din0_ptr = zero_ptr;
+          din1_ptr = dr0;
+          din2_ptr = dr1;
+          din3_ptr = dr2;
+          din4_ptr = dr3;
+          dr0 = dr3;
+          dr1 = dr4;
+        } else {
+          dr0 = dr4;
+          dr1 = dr0 + w_in;
+        }
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+
+        //! process bottom pad
+        if (i * 2 + 4 > h_in) {
+          switch (i * 2 + 4 - h_in) {
+            case 4:
+              din1_ptr = zero_ptr;
+            case 3:
+              din2_ptr = zero_ptr;
+            case 2:
+              din3_ptr = zero_ptr;
+            case 1:
+              din4_ptr = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! process output pad
+        if (i + 2 > h_out) {
+          doutr1_ptr = write_ptr;
+        }
+        int cnt = cnt_col;
+        asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU
+                         MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU
+                             RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU
+                     : [inptr0] "+r"(din0_ptr),
+                       [inptr1] "+r"(din1_ptr),
+                       [inptr2] "+r"(din2_ptr),
+                       [inptr3] "+r"(din3_ptr),
+                       [inptr4] "+r"(din4_ptr),
+                       [outptr0] "+r"(doutr0_ptr),
+                       [outptr1] "+r"(doutr1_ptr),
+                       [cnt] "+r"(cnt)
+                     : [vzero] "w"(vzero),
+                       [w0] "w"(wr0),
+                       [w1] "w"(wr1),
+                       [w2] "w"(wr2),
+                       [remain] "r"(cnt_remain),
+                       [scale_ptr] "r"(scale),
+                       [mask1] "w"(vmask_rp1),
+                       [mask2] "w"(vmask_rp2),
+                       [wmask] "w"(wmask),
+                       [vbias] "w"(wbias)
+                     : "cc",
+                       "memory",
+                       "v0",
+                       "v1",
+                       "v2",
+                       "v3",
+                       "v4",
+                       "v5",
+                       "v6",
+                       "v7",
+                       "v8",
+                       "v9",
+                       "v10",
+                       "v11",
+                       "v12",
+                       "v13",
+                       "v14",
+                       "v15",
+                       "v16",
+                       "v17",
+                       "v18",
+                       "v19",
+                       "v20",
+                       "v21",
+                       "v22");
+        doutr0 = doutr0 + 2 * w_out;
+      }
+#else
+      for (int i = 0; i < h_out; i++) {
+        din0_ptr = dr0;
+        din1_ptr = dr1;
+        din2_ptr = dr2;
+
+        doutr0_ptr = doutr0;
+
+        if (i == 0) {
+          din0_ptr = zero_ptr;
+          din1_ptr = dr0;
+          din2_ptr = dr1;
+          dr0 = dr1;
+          dr1 = dr2;
+          dr2 = dr1 + w_in;
+        } else {
+          dr0 = dr2;
+          dr1 = dr0 + w_in;
+          dr2 = dr1 + w_in;
+        }
+
+        //! process bottom pad
+        if (i * 2 + 2 > h_in) {
+          switch (i * 2 + 2 - h_in) {
+            case 2:
+              din1_ptr = zero_ptr;
+            case 1:
+              din2_ptr = zero_ptr;
+            default:
+              break;
+          }
+        }
+        int cnt = cnt_col;
+        unsigned int* mask_ptr = dmask;
+        asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU
+                         MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU
+                             RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU
+                     : [din0_ptr] "+r"(din0_ptr),
+                       [din1_ptr] "+r"(din1_ptr),
+                       [din2_ptr] "+r"(din2_ptr),
+                       [outptr] "+r"(doutr0_ptr),
+                       [cnt] "+r"(cnt),
+                       [mask_ptr] "+r"(mask_ptr)
+                     : [remain] "r"(cnt_remain),
+                       [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [scale_ptr] "r"(scale),
+                       [bias] "r"(bias_c)
+                     : "cc",
+                       "memory",
+                       "q3",
+                       "q4",
+                       "q5",
+                       "q6",
+                       "q7",
+                       "q8",
+                       "q9",
+                       "q10",
+                       "q11",
+                       "q12",
+                       "q13",
+                       "q14",
+                       "q15");
+        doutr0 = doutr0 + w_out;
+      }
+#endif
+    }
+  }
+}
+
+/**
+ * \brief depthwise convolution kernel 3x3, stride 2, width <= 4, with the
+ *        relu6 result stage (RESULT_S_S2_RELU6) fused into the kernel
+ *
+ * Every output row is produced by one inline-asm call that writes into a
+ * 4-float scratch buffer; only the first w_out values of that buffer are then
+ * copied to dout, so callers must guarantee w_out <= 4.
+ *
+ * \param dout      output, indexed as [num][ch_in][h_out][w_out], contiguous
+ * \param din       input, indexed as [num][ch_in][h_in][w_in], contiguous
+ * \param weights   9 floats per channel (three rows of three taps)
+ * \param bias      per-channel bias, only read when flag_bias is true
+ * \param six       values consumed by the relu6 result stage; on aarch64 four
+ *                  floats are loaded from it (presumably the upper clip value
+ *                  replicated four times - confirm against the caller)
+ * \param flag_bias when false a bias of 0 is used
+ * \param num       batch size
+ * \param ch_in     number of channels (depthwise: same for input and output)
+ * \param h_in      input height
+ * \param w_in      input width
+ * \param h_out     output height
+ * \param w_out     output width, must not exceed 4
+ * \param ctx       not used by this variant
+ */
+void conv_depthwise_3x3s2p1_bias_s_relu6(float* dout,
+                                         const float* din,
+                                         const float* weights,
+                                         const float* bias,
+                                         const float* six,
+                                         bool flag_bias,
+                                         const int num,
+                                         const int ch_in,
+                                         const int h_in,
+                                         const int w_in,
+                                         const int h_out,
+                                         const int w_out,
+                                         ARMContext* ctx) {
+  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+  // all-zero row substituted for input rows that fall outside the image
+  float zeros[8] = {0.0f};
+
+  // A mask lane is all-ones when its column index (see right_pad_idx) is
+  // smaller than w_in, i.e. when that input column really exists.
+  uint32x4_t vmask_rp1 =
+      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx));  // 0 2 4 6
+  uint32x4_t vmask_rp2 =
+      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+
+  // The asm reads both masks from memory through mask_ptr.
+  unsigned int dmask[8];
+  vst1q_u32(dmask, vmask_rp1);
+  vst1q_u32(dmask + 4, vmask_rp2);
+#ifdef __aarch64__
+  float32x4_t vsix = vld1q_f32(six);
+#endif
+  for (int n = 0; n < num; ++n) {
+    const float* din_batch = din + n * ch_in * size_in_channel;
+    float* dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      const float* din_channel = din_batch + i * size_in_channel;
+      float* dout_channel = dout_batch + i * size_out_channel;
+
+      // NOTE(review): each load fetches four floats, so the load at
+      // weight_ptr + 6 touches one float beyond this channel's nine weights;
+      // confirm the weights buffer is padded for the last channel.
+      const float* weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+
+      float bias_c = 0.f;
+
+      if (flag_bias) {
+        bias_c = bias[i];
+      }
+      float32x4_t vbias = vdupq_n_f32(bias_c);
+      // hs is the first input row of the current 3-row window, he is one past
+      // its last row; the window starts one row above the image.
+      int hs = -1;
+      int he = 2;
+      float out_buf[4];
+      for (int j = 0; j < h_out; ++j) {
+        const float* dr0 = din_channel + hs * w_in;
+        const float* dr1 = dr0 + w_in;
+        const float* dr2 = dr1 + w_in;
+        if (hs == -1) {
+          // top padding row
+          dr0 = zeros;
+        }
+        if (he > h_in) {
+          // bottom padding row
+          dr2 = zeros;
+        }
+        const float* din0_ptr = dr0;
+        const float* din1_ptr = dr1;
+        const float* din2_ptr = dr2;
+
+        unsigned int* mask_ptr = dmask;
+#ifdef __aarch64__
+        // The asm stores its results through the [out] pointer, which is only
+        // an input operand, so "memory" must be listed as clobbered or the
+        // compiler may assume out_buf is untouched. "cc" is declared as well,
+        // matching the 32-bit path below.
+        asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU6
+                     : [din0_ptr] "+r"(din0_ptr),
+                       [din1_ptr] "+r"(din1_ptr),
+                       [din2_ptr] "+r"(din2_ptr),
+                       [mask_ptr] "+r"(mask_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [bias] "w"(vbias),
+                       [vsix] "w"(vsix),
+                       [out] "r"(out_buf)
+                     : "cc",
+                       "memory",
+                       "v4",
+                       "v5",
+                       "v6",
+                       "v7",
+                       "v8",
+                       "v9",
+                       "v10",
+                       "v11",
+                       "v12",
+                       "v13",
+                       "v14",
+                       "v15");
+#else
+        asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU6
+                     : [din0_ptr] "+r"(din0_ptr),
+                       [din1_ptr] "+r"(din1_ptr),
+                       [din2_ptr] "+r"(din2_ptr),
+                       [mask_ptr] "+r"(mask_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [bias] "r"(bias_c),
+                       [six_ptr] "r"(six),
+                       [out] "r"(out_buf)
+                     : "cc",
+                       "memory",
+                       "q3",
+                       "q4",
+                       "q5",
+                       "q6",
+                       "q7",
+                       "q8",
+                       "q9",
+                       "q10",
+                       "q11",
+                       "q12",
+                       "q13",
+                       "q14",
+                       "q15");
+#endif
+        // Copy only the valid part of the scratch row to the output.
+        for (int w = 0; w < w_out; ++w) {
+          *dout_channel++ = out_buf[w];
+        }
+        // stride 2: the window moves down two input rows per output row
+        hs += 2;
+        he += 2;
+      }
+    }
+  }
+}
+void conv_depthwise_3x3s2p1_bias_s_leakyRelu(float* dout,
+                                             const float* din,
+                                             const float* weights,
+                                             const float* bias,
+                                             const float* scale,
+                                             bool flag_bias,
+                                             const int num,
+                                             const int ch_in,
+                                             const int h_in,
+                                             const int w_in,
+                                             const int h_out,
+                                             const int w_out,
+                                             ARMContext* ctx) {
+  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+  int out_pad_idx[4] = {0, 1, 2, 3};
+  float zeros[8] = {0.0f};
+
+  uint32x4_t vmask_rp1 =
+      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx));  // 0 2 4 6
+  uint32x4_t vmask_rp2 =
+      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+
+  unsigned int dmask[8];
+  vst1q_u32(dmask, vmask_rp1);
+  vst1q_u32(dmask + 4, vmask_rp2);
+#ifdef __aarch64__
+  float32x4_t vscale = vld1q_f32(scale);
+  float32x4_t vzero = vdupq_n_f32(0.f);
+#endif
+  for (int n = 0; n < num; ++n) {
+    const float* din_batch = din + n * ch_in * size_in_channel;
+    float* dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      const float* din_channel = din_batch + i * size_in_channel;
+      float* dout_channel = dout_batch + i * size_out_channel;
+
+      const float* weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+
+      float bias_c = 0.f;
+
+      if (flag_bias) {
+        bias_c = bias[i];
+      }
+      float32x4_t vbias = vdupq_n_f32(bias_c);
+      int hs = -1;
+      int he = 2;
+      float out_buf[4];
+      for (int j = 0; j < h_out; ++j) {
+        const float* dr0 = din_channel + hs * w_in;
+        const float* dr1 = dr0 + w_in;
+        const float* dr2 = dr1 + w_in;
+        if (hs == -1) {
+          dr0 = zeros;
+        }
+        if (he > h_in) {
+          dr2 = zeros;
+        }
+        const float* din0_ptr = dr0;
+        const float* din1_ptr = dr1;
+        const float* din2_ptr = dr2;
+
+        unsigned int* mask_ptr = dmask;
+#ifdef __aarch64__
+        asm volatile(COMPUTE_S_S2 RESULT_S_S2_LEAKY_RELU
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
                       [din2_ptr] "+r"(din2_ptr),
@@ -1522,6 +2077,8 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
                       [wr1] "w"(wr1),
                       [wr2] "w"(wr2),
                       [bias] "w"(vbias),
+                       [vzero] "w"(vzero),
+                       [vscale] "w"(vscale),
                       [out] "r"(out_buf)
                     : "v4",
                       "v5",
@@ -1536,7 +2093,7 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
                       "v14",
                       "v15");
 #else
-        asm volatile(COMPUTE_S_S2 RESULT_S_S2
+        asm volatile(COMPUTE_S_S2 RESULT_S_S2_LEAKY_RELU
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
                       [din2_ptr] "+r"(din2_ptr),
@@ -1545,6 +2102,7 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
                       [wr1] "w"(wr1),
                       [wr2] "w"(wr2),
                       [bias] "r"(bias_c),
+                       [scale_ptr] "r"(scale),
                       [out] "r"(out_buf)
                     : "cc",
                       "memory",
@@ -1562,10 +2120,6 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
                       "q14",
                       "q15");
 #endif
-        // do act
-        if (act_param.has_active) {
-          act_switch_process(out_buf, out_buf, w_out, &act_param);
-        }
        for (int w = 0; w < w_out; ++w) {
          *dout_channel++ = out_buf[w];
        }
@@ -1575,90 +2129,141 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
    }
  }
 }
+/**
+ * \brief depthwise convolution kernel 3x3, stride 2
+ */
+// w_in > 7
+void conv_depthwise_3x3s2p0_bias_relu6(float* dout,
+                                       const float* din,
+                                       const float* weights,
+                                       const float* bias,
+                                       const float* six,
+                                       bool flag_bias,
+                                       const int num,
+                                       const int ch_in,
+                                       const int h_in,
+                                       const int w_in,
+                                       const int h_out,
+                                       const int w_out,
+                                       ARMContext* ctx) {
+  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+  int out_pad_idx[4] = {0, 1, 2, 3};
+
+  int tile_w = w_out >> 2;
+  int cnt_remain = w_out % 4;
+
+  unsigned int size_right_remain = (unsigned int)(8 + (tile_w << 3) - w_in);
+  size_right_remain = 8 - size_right_remain;
+
+  if (cnt_remain == 0 && size_right_remain == 0) {
+    cnt_remain = 4;
+    tile_w -= 1;
+    size_right_remain = 8;
+  }
+
+  uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),
+                                   vld1q_s32(right_pad_idx));  // 0 2 4 6
+  uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain),
+                                   vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
+  uint32x4_t wmask =
+      vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));  // 0 1 2 3
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+
+  float* zero_ptr = ctx->workspace_data<float>();
+  memset(zero_ptr, 0, w_in * sizeof(float));
+  float* write_ptr = zero_ptr + w_in;
+
+  unsigned int dmask[12];
+
+  vst1q_u32(dmask, vmask_rp1);
+  vst1q_u32(dmask + 4, vmask_rp2);
+  vst1q_u32(dmask + 8, wmask);
+
+  for (int n = 0; n < num; ++n) {
+    const float* din_batch = din + n * ch_in * size_in_channel;
+    float* dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      const float* din_channel = din_batch + i * size_in_channel;
+      float* dout_channel = dout_batch + i * size_out_channel;
+
+      const float* weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+
+      float32x4_t vzero = vdupq_n_f32(0.f);

 #ifdef __aarch64__
-void act_switch_3x3s2p0(const float* din0_ptr,
-                        const float* din1_ptr,
-                        const float* din2_ptr,
-                        const float* din3_ptr,
-                        const float* din4_ptr,
-                        float* doutr0_ptr,
-                        float* doutr1_ptr,
-                        float32x4_t wr0,
-                        float32x4_t wr1,
-                        float32x4_t wr2,
-                        uint32x4_t vmask_rp1,
-                        uint32x4_t vmask_rp2,
-                        uint32x4_t wmask,
-                        float32x4_t wbias,
-                        float32x4_t vzero,
-                        int cnt,
-                        int cnt_remain,
-                        const operators::ActivationParam act_param) {
-  float tmp = act_param.Relu_clipped_coef;
-  float ss = act_param.Leaky_relu_alpha;
-  float vsix[4] = {tmp, tmp, tmp, tmp};
-  float vscale[4] = {ss, ss, ss, ss};
+      float32x4_t wbias;
+      if (flag_bias) {
+        wbias = vdupq_n_f32(bias[i]);
+      } else {
+        wbias = vdupq_n_f32(0.f);
+      }
+#else
+      float bias_c = 0.f;
+      if (flag_bias) {
+        bias_c = bias[i];
+      }
+#endif  // __aarch64__

-  switch (act_param.active_type) {
-    case lite_api::ActivationType::kRelu:
-      asm volatile(
-          INIT_S2
-          "ld1 {v15.4s}, [%[inptr0]]                 \n"
-          "ld1 {v18.4s}, [%[inptr1]]                 \n"
-          "ld1 {v19.4s}, [%[inptr2]]                 \n"
-          "ld1 {v20.4s}, [%[inptr3]]                 \n"
-          "ld1 {v21.4s}, [%[inptr4]]                 \n"
-          "ext  v10.16b, v0.16b, v15.16b, #4     \n"  // v10 = {2,4,6,8}
-          MID_COMPUTE_S2 MID_RESULT_S2_RELU
-          "cmp %w[remain], #1                           \n"
-          "blt 4f                                     \n" RIGHT_COMPUTE_S2
-              RIGHT_RESULT_S2_RELU
-          "4:                                          \n"
-          : [inptr0] "+r"(din0_ptr),
-            [inptr1] "+r"(din1_ptr),
-            [inptr2] "+r"(din2_ptr),
-            [inptr3] "+r"(din3_ptr),
-            [inptr4] "+r"(din4_ptr),
-            [outptr0] "+r"(doutr0_ptr),
-            [outptr1] "+r"(doutr1_ptr),
-            [cnt] "+r"(cnt)
-          : [vzero] "w"(vzero),
-            [w0] "w"(wr0),
-            [w1] "w"(wr1),
-            [w2] "w"(wr2),
-            [remain] "r"(cnt_remain),
-            [mask1] "w"(vmask_rp1),
-            [mask2] "w"(vmask_rp2),
-            [wmask] "w"(wmask),
-            [vbias] "w"(wbias)
-          : "cc",
-            "memory",
-            "v0",
-            "v1",
-            "v2",
-            "v3",
-            "v4",
-            "v5",
-            "v6",
-            "v7",
-            "v8",
-            "v9",
-            "v10",
-            "v11",
-            "v12",
-            "v13",
-            "v14",
-            "v15",
-            "v16",
-            "v17",
-            "v18",
-            "v19",
-            "v20",
-            "v21");
+      const float* dr0 = din_channel;
+      const float* dr1 = dr0 + w_in;
+      const float* dr2 = dr1 + w_in;
+      const float* dr3 = dr2 + w_in;
+      const float* dr4 = dr3 + w_in;
+
+      const float* din0_ptr = dr0;
+      const float* din1_ptr = dr1;
+      const float* din2_ptr = dr2;
+      const float* din3_ptr = dr3;
+      const float* din4_ptr = dr4;
+
+      float* doutr0 = dout_channel;
+      float* doutr0_ptr = nullptr;
+      float* doutr1_ptr = nullptr;
+
+#ifdef __aarch64__
+      for (int i = 0; i < h_out; i += 2) {
+        din0_ptr = dr0;
+        din1_ptr = dr1;
+        din2_ptr = dr2;
+        din3_ptr = dr3;
+        din4_ptr = dr4;
+
+        doutr0_ptr = doutr0;
+        doutr1_ptr = doutr0 + w_out;
+
+        dr0 = dr4;
+        dr1 = dr0 + w_in;
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+
+        //! process bottom pad
+        if (i * 2 + 5 > h_in) {
+          switch (i * 2 + 5 - h_in) {
+            case 4:
+              din1_ptr = zero_ptr;
+            case 3:
+              din2_ptr = zero_ptr;
+            case 2:
+              din3_ptr = zero_ptr;
+            case 1:
+              din4_ptr = zero_ptr;
+            case 0:
+              din4_ptr = zero_ptr;
+            default:
              break;
-    case lite_api::ActivationType::kRelu6:
-      /* 0 <= din <= 6 */
+          }
+        }
+        //! process output pad
+        if (i + 2 > h_out) {
+          doutr1_ptr = write_ptr;
+        }
+        int cnt = tile_w;
        asm volatile(
            INIT_S2
            "ld1 {v15.4s}, [%[inptr0]]                 \n"
@@ -1686,7 +2291,7 @@ void act_switch_3x3s2p0(const float* din0_ptr,
              [w1] "w"(wr1),
              [w2] "w"(wr2),
              [remain] "r"(cnt_remain),
-            [six_ptr] "r"(vsix),
+              [six_ptr] "r"(six),
              [mask1] "w"(vmask_rp1),
              [mask2] "w"(vmask_rp2),
              [wmask] "w"(wmask),
@@ -1716,81 +2321,74 @@ void act_switch_3x3s2p0(const float* din0_ptr,
              "v20",
              "v21",
              "v22");
+        doutr0 = doutr0 + 2 * w_out;
+      }
+#else
+      for (int i = 0; i < h_out; i++) {
+        din0_ptr = dr0;
+        din1_ptr = dr1;
+        din2_ptr = dr2;
+
+        doutr0_ptr = doutr0;
+
+        dr0 = dr2;
+        dr1 = dr0 + w_in;
+        dr2 = dr1 + w_in;
+
+        //! process bottom pad
+        if (i * 2 + 3 > h_in) {
+          switch (i * 2 + 3 - h_in) {
+            case 2:
+              din1_ptr = zero_ptr;
+            case 1:
+              din2_ptr = zero_ptr;
+            default:
              break;
-    case lite_api::ActivationType::kLeakyRelu:
-      /*din = din >= 0 ? din : din * scale*/
-      asm volatile(
-          INIT_S2
-          "ld1 {v15.4s}, [%[inptr0]]                 \n"
-          "ld1 {v18.4s}, [%[inptr1]]                 \n"
-          "ld1 {v19.4s}, [%[inptr2]]                 \n"
-          "ld1 {v20.4s}, [%[inptr3]]                 \n"
-          "ld1 {v21.4s}, [%[inptr4]]                 \n"
-          "ext  v10.16b, v0.16b, v15.16b, #4     \n"  // v10 = {2,4,6,8}
-          "ld1 {v22.4s}, [%[scale_ptr]]                  \n" MID_COMPUTE_S2
-              MID_RESULT_S2_LEAKY_RELU
-          "cmp %w[remain], #1                           \n"
-          "blt 4f                                     \n" RIGHT_COMPUTE_S2
-              RIGHT_RESULT_S2_LEAKY_RELU
-          "4:                                          \n"
-          : [inptr0] "+r"(din0_ptr),
-            [inptr1] "+r"(din1_ptr),
-            [inptr2] "+r"(din2_ptr),
-            [inptr3] "+r"(din3_ptr),
-            [inptr4] "+r"(din4_ptr),
-            [outptr0] "+r"(doutr0_ptr),
-            [outptr1] "+r"(doutr1_ptr),
-            [cnt] "+r"(cnt)
-          : [vzero] "w"(vzero),
-            [w0] "w"(wr0),
-            [w1] "w"(wr1),
-            [w2] "w"(wr2),
-            [remain] "r"(cnt_remain),
-            [scale_ptr] "r"(vscale),
-            [mask1] "w"(vmask_rp1),
-            [mask2] "w"(vmask_rp2),
-            [wmask] "w"(wmask),
-            [vbias] "w"(wbias)
+          }
+        }
+        int cnt = tile_w;
+        unsigned int* mask_ptr = dmask;
+        asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2
+                         RIGHT_RESULT_S2_RELU6
+                     : [din0_ptr] "+r"(din0_ptr),
+                       [din1_ptr] "+r"(din1_ptr),
+                       [din2_ptr] "+r"(din2_ptr),
+                       [outptr] "+r"(doutr0_ptr),
+                       [cnt] "+r"(cnt),
+                       [mask_ptr] "+r"(mask_ptr)
+                     : [remain] "r"(cnt_remain),
+                       [six_ptr] "r"(six),
+                       [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [bias] "r"(bias_c)
                     : "cc",
                       "memory",
-            "v0",
-            "v1",
-            "v2",
-            "v3",
-            "v4",
-            "v5",
-            "v6",
-            "v7",
-            "v8",
-            "v9",
-            "v10",
-            "v11",
-            "v12",
-            "v13",
-            "v14",
-            "v15",
-            "v16",
-            "v17",
-            "v18",
-            "v19",
-            "v20",
-            "v21",
-            "v22");
-      break;
-    default:
-      LOG(FATAL) << "this act_type: " << static_cast<int>(act_param.active_type)
-                 << " fuse not support";
+                       "q3",
+                       "q4",
+                       "q5",
+                       "q6",
+                       "q7",
+                       "q8",
+                       "q9",
+                       "q10",
+                       "q11",
+                       "q12",
+                       "q13",
+                       "q14",
+                       "q15");
+        doutr0 = doutr0 + w_out;
      }
-}
 #endif
-/**
- * \brief depthwise convolution kernel 3x3, stride 2
- */
-// w_in > 7
-void conv_depthwise_3x3s2p0_bias(float* dout,
+    }
+  }
+}
+
+void conv_depthwise_3x3s2p0_bias_leakyRelu(float* dout,
                                           const float* din,
                                           const float* weights,
                                           const float* bias,
+                                           const float* scale,
                                           bool flag_bias,
                                           const int num,
                                           const int ch_in,
@@ -1798,7 +2396,6 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
                                           const int w_in,
                                           const int h_out,
                                           const int w_out,
-                                 const operators::ActivationParam act_param,
                                           ARMContext* ctx) {
  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
  int out_pad_idx[4] = {0, 1, 2, 3};
@@ -1918,24 +2515,63 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
          doutr1_ptr = write_ptr;
        }
        int cnt = tile_w;
-        act_switch_3x3s2p0(din0_ptr,
-                           din1_ptr,
-                           din2_ptr,
-                           din3_ptr,
-                           din4_ptr,
-                           doutr0_ptr,
-                           doutr1_ptr,
-                           wr0,
-                           wr1,
-                           wr2,
-                           vmask_rp1,
-                           vmask_rp2,
-                           wmask,
-                           wbias,
-                           vzero,
-                           cnt,
-                           cnt_remain,
-                           act_param);
+        asm volatile(
+            INIT_S2
+            "ld1 {v15.4s}, [%[inptr0]]                 \n"
+            "ld1 {v18.4s}, [%[inptr1]]                 \n"
+            "ld1 {v19.4s}, [%[inptr2]]                 \n"
+            "ld1 {v20.4s}, [%[inptr3]]                 \n"
+            "ld1 {v21.4s}, [%[inptr4]]                 \n"
+            "ext  v10.16b, v0.16b, v15.16b, #4     \n"  // v10 = {2,4,6,8}
+            "ld1 {v22.4s}, [%[scale_ptr]]                  \n" MID_COMPUTE_S2
+                MID_RESULT_S2_LEAKY_RELU
+            "cmp %w[remain], #1                           \n"
+            "blt 4f                                     \n" RIGHT_COMPUTE_S2
+                RIGHT_RESULT_S2_LEAKY_RELU
+            "4:                                          \n"
+            : [inptr0] "+r"(din0_ptr),
+              [inptr1] "+r"(din1_ptr),
+              [inptr2] "+r"(din2_ptr),
+              [inptr3] "+r"(din3_ptr),
+              [inptr4] "+r"(din4_ptr),
+              [outptr0] "+r"(doutr0_ptr),
+              [outptr1] "+r"(doutr1_ptr),
+              [cnt] "+r"(cnt)
+            : [vzero] "w"(vzero),
+              [w0] "w"(wr0),
+              [w1] "w"(wr1),
+              [w2] "w"(wr2),
+              [remain] "r"(cnt_remain),
+              [scale_ptr] "r"(scale),
+              [mask1] "w"(vmask_rp1),
+              [mask2] "w"(vmask_rp2),
+              [wmask] "w"(wmask),
+              [vbias] "w"(wbias)
+            : "cc",
+              "memory",
+              "v0",
+              "v1",
+              "v2",
+              "v3",
+              "v4",
+              "v5",
+              "v6",
+              "v7",
+              "v8",
+              "v9",
+              "v10",
+              "v11",
+              "v12",
+              "v13",
+              "v14",
+              "v15",
+              "v16",
+              "v17",
+              "v18",
+              "v19",
+              "v20",
+              "v21",
+              "v22");
        doutr0 = doutr0 + 2 * w_out;
      }
 #else
@@ -1963,8 +2599,8 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
        }
        int cnt = tile_w;
        unsigned int* mask_ptr = dmask;
-        asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2
-                         RIGHT_RESULT_S2
+        asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU
+                         RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
                       [din2_ptr] "+r"(din2_ptr),
@@ -1972,6 +2608,7 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
                       [cnt] "+r"(cnt),
                       [mask_ptr] "+r"(mask_ptr)
                     : [remain] "r"(cnt_remain),
+                       [scale_ptr] "r"(scale),
                       [wr0] "w"(wr0),
                       [wr1] "w"(wr1),
                       [wr2] "w"(wr2),
@@ -1991,9 +2628,6 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
                       "q13",
                       "q14",
                       "q15");
-        if (act_param.has_active) {
-          act_switch_process(doutr0, doutr0, w_out, &act_param);
-        }
        doutr0 = doutr0 + w_out;
      }
 #endif
@@ -2004,10 +2638,11 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
 /**
 * \brief depthwise convolution kernel 3x3, stride 2, width <= 4
 */
-void conv_depthwise_3x3s2p0_bias_s(float* dout,
+void conv_depthwise_3x3s2p0_bias_s_relu6(float* dout,
                                         const float* din,
                                         const float* weights,
                                         const float* bias,
+                                         const float* six,
                                         bool flag_bias,
                                         const int num,
                                         const int ch_in,
@@ -2015,7 +2650,6 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
                                         const int w_in,
                                         const int h_out,
                                         const int w_out,
-                                   const operators::ActivationParam act_param,
                                         ARMContext* ctx) {
  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
  int out_pad_idx[4] = {0, 1, 2, 3};
@@ -2033,6 +2667,10 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
  unsigned int dmask[8];
  vst1q_u32(dmask, vmask_rp1);
  vst1q_u32(dmask + 4, vmask_rp2);
+#ifdef __aarch64__
+  float32x4_t vsix = vld1q_f32(six);
+  float32x4_t vzero = vdupq_n_f32(0.f);
+#endif

  for (int n = 0; n < num; ++n) {
    const float* din_batch = din + n * ch_in * size_in_channel;
@@ -2077,7 +2715,7 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,

        unsigned int* mask_ptr = dmask;
 #ifdef __aarch64__
-        asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
+        asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU6
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
                       [din2_ptr] "+r"(din2_ptr),
@@ -2086,6 +2724,8 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
                       [wr1] "w"(wr1),
                       [wr2] "w"(wr2),
                       [bias] "w"(vbias),
+                       [vzero] "w"(vzero),
+                       [vsix] "w"(vsix),
                       [out] "r"(out_buf)
                     : "cc",
                       "memory",
@@ -2104,7 +2744,7 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
                       "v16");

 #else
-        asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
+        asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU6
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
                       [din2_ptr] "+r"(din2_ptr)
@@ -2113,6 +2753,7 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
                       [wr2] "w"(wr2),
                       [bias] "r"(bias_c),
                       [out] "r"(out_buf),
+                       [six_ptr] "r"(six),
                       [mask_ptr] "r"(dmask)
                     : "cc",
                       "memory",
@@ -2130,9 +2771,145 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
                       "q14",
                       "q15");
 #endif
-        if (act_param.has_active) {
-          act_switch_process(out_buf, out_buf, w_out, &act_param);
+        for (int w = 0; w < w_out; ++w) {
+          *dout_channel++ = out_buf[w];
+        }
+      }
+    }
+  }
+}
+void conv_depthwise_3x3s2p0_bias_s_leakyRelu(float* dout,  // receives w_out floats per output row, copied from out_buf after each asm call
+                                             const float* din,
+                                             const float* weights,  // 9 floats per channel (weights + i * 9)
+                                             const float* bias,  // read as bias[i], and only when flag_bias is true
+                                             const float* scale,  // aarch64: scale[0..3] loaded once; ARMv7: pointer handed to the asm
+                                             bool flag_bias,
+                                             const int num,
+                                             const int ch_in,
+                                             const int h_in,
+                                             const int w_in,
+                                             const int h_out,
+                                             const int w_out,
+                                             ARMContext* ctx) {  // ctx is not referenced in the lines of this function shown here
+  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};  // input column index per mask lane: even columns first, then odd
+  int out_pad_idx[4] = {0, 1, 2, 3};  // not referenced in the lines of this function shown here
+  float zeros[8] = {0.0f};  // not referenced in the lines of this function shown here
+  const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f};  // stand-in row for the bottom-pad switch; NOTE(review): only 4 floats while the masks span 8 columns - confirm the asm never loads more
+
+  uint32x4_t vmask_rp1 =  // a lane is set when w_in is greater than its column index
+      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx));  // 0 2 4 6
+  uint32x4_t vmask_rp2 =
+      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+
+  unsigned int dmask[8];  // both masks stored back to back; handed to the asm as mask_ptr
+  vst1q_u32(dmask, vmask_rp1);
+  vst1q_u32(dmask + 4, vmask_rp2);
+#ifdef __aarch64__
+  float32x4_t vscale = vld1q_f32(scale);  // reads scale[0..3]
+  float32x4_t vzero = vdupq_n_f32(0.f);
+#endif
+
+  for (int n = 0; n < num; ++n) {
+    const float* din_batch = din + n * ch_in * size_in_channel;
+    float* dout_batch = dout + n * ch_in * size_out_channel;  // output is indexed with ch_in channels as well
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      const float* din_channel = din_batch + i * size_in_channel;
+      float* dout_channel = dout_batch + i * size_out_channel;
+
+      const float* weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);  // each load takes 4 floats starting at a kernel row
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);  // NOTE(review): loads elements 6..9, one past the 9 weights of this channel - confirm the weights buffer has slack after the last channel
+
+      float bias_c = 0.f;
+
+      if (flag_bias) {
+        bias_c = bias[i];
+      }
+      float32x4_t vbias = vdupq_n_f32(bias_c);  // operand of the aarch64 asm; the ARMv7 asm receives bias_c instead
+      float out_buf[4];  // staging row passed to the asm; the copy loop after it reads out_buf[0..w_out), so w_out must be <= 4
+      const float* dr0 = din_channel;
+      const float* dr1 = dr0 + w_in;
+      const float* dr2 = dr1 + w_in;
+      for (int j = 0; j < h_out; j++) {  // one output row per iteration
+        const float* din0_ptr = dr0;
+        const float* din1_ptr = dr1;
+        const float* din2_ptr = dr2;
+        if (j * 2 + 2 >= h_in) {  // third input row of this output row is at or past the input height
+          switch (j + 2 - h_in) {  // NOTE(review): the guard uses j * 2 + 2 but this uses j + 2 - presumably meant j * 2 + 2 - h_in; confirm
+            case 1:
+              din1_ptr = zero_ptr;  // no break: falls through to case 0
+            case 0:
+              din2_ptr = zero_ptr;
+            default:
+              break;
+          }
         }
+        dr0 = dr2;  // the next output row starts two input rows further down
+        dr1 = dr0 + w_in;
+        dr2 = dr1 + w_in;
+
+        unsigned int* mask_ptr = dmask;  // used by the aarch64 asm only; the ARMv7 asm is given dmask directly
+#ifdef __aarch64__
+        asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_LEAKY_RELU  // macro bodies are not part of this hunk
+                     : [din0_ptr] "+r"(din0_ptr),
+                       [din1_ptr] "+r"(din1_ptr),
+                       [din2_ptr] "+r"(din2_ptr),
+                       [mask_ptr] "+r"(mask_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [bias] "w"(vbias),
+                       [vzero] "w"(vzero),
+                       [vscale] "w"(vscale),
+                       [out] "r"(out_buf)  // address passed as a plain input; the memory clobber below covers stores made through it
+                     : "cc",
+                       "memory",
+                       "v4",
+                       "v5",
+                       "v6",
+                       "v7",
+                       "v8",
+                       "v9",
+                       "v10",
+                       "v11",
+                       "v12",
+                       "v13",
+                       "v14",
+                       "v15",
+                       "v16");
+#else
+        asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_LEAKY_RELU  // same macro names; presumably the ARMv7 definitions are selected by the preprocessor - TODO confirm
+                     : [din0_ptr] "+r"(din0_ptr),
+                       [din1_ptr] "+r"(din1_ptr),
+                       [din2_ptr] "+r"(din2_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [bias] "r"(bias_c),  // scalar bias in a general register on this path
+                       [out] "r"(out_buf),
+                       [scale_ptr] "r"(scale),  // scale is passed by pointer on this path
+                       [mask_ptr] "r"(dmask)
+                     : "cc",
+                       "memory",
+                       "q3",
+                       "q4",
+                       "q5",
+                       "q6",
+                       "q7",
+                       "q8",
+                       "q9",
+                       "q10",
+                       "q11",
+                       "q12",
+                       "q13",
+                       "q14",
+                       "q15");
+#endif
        for (int w = 0; w < w_out; ++w) {
          *dout_channel++ = out_buf[w];
        }

--- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc
+++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc
@@ -20,6 +20,7 @@ namespace lite {
 namespace arm {
 namespace math {

+// clang-format off
 #ifdef __aarch64__
 #define INIT_S2                                  \
  "prfm pldl1keep, [%[inptr0]]             \n"   \
@@ -683,6 +684,7 @@ namespace math {
  "vst1.32 {d6-d7}, [%[out]]                            \n"

 #endif
+// clang-format on

 /**
 * \brief depthwise convolution kernel 3x3, stride 2
@@ -825,7 +827,6 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout,
          doutr1_ptr = write_ptr;
        }
        int cnt = cnt_col;
-        if (flag_relu) {
        asm volatile(
            INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2
                MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU
@@ -870,7 +871,215 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout,
              "v19",
              "v20",
              "v21");
+        doutr0 = doutr0 + 2 * w_out;
+      }
+#else
+      for (int i = 0; i < h_in; i += 2) {
+        din0_ptr = dr0;
+        din1_ptr = dr1;
+        din2_ptr = dr2;
+
+        doutr0_ptr = doutr0;
+
+        if (i == 0) {
+          din0_ptr = zero_ptr;
+          din1_ptr = dr0;
+          din2_ptr = dr1;
+          dr0 = dr1;
+          dr1 = dr2;
+          dr2 = dr1 + w_in;
+        } else {
+          dr0 = dr2;
+          dr1 = dr0 + w_in;
+          dr2 = dr1 + w_in;
+        }
+
+        //! process bottom pad
+        if (i + 2 > h_in) {
+          switch (i + 2 - h_in) {
+            case 2:
+              din1_ptr = zero_ptr;
+            case 1:
+              din2_ptr = zero_ptr;
+            default:
+              break;
+          }
+        }
+        int cnt = cnt_col;
+        unsigned int* mask_ptr = dmask;
+        asm volatile(
+            INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2
+                MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU
+            : [din0_ptr] "+r"(din0_ptr),
+              [din1_ptr] "+r"(din1_ptr),
+              [din2_ptr] "+r"(din2_ptr),
+              [outptr] "+r"(doutr0_ptr),
+              [cnt] "+r"(cnt),
+              [mask_ptr] "+r"(mask_ptr)
+            : [remain] "r"(cnt_remain),
+              [wr0] "w"(wr0),
+              [wr1] "w"(wr1),
+              [wr2] "w"(wr2),
+              [bias] "r"(bias_c)
+            : "cc",
+              "memory",
+              "q3",
+              "q4",
+              "q5",
+              "q6",
+              "q7",
+              "q8",
+              "q9",
+              "q10",
+              "q11",
+              "q12",
+              "q13",
+              "q14",
+              "q15");
+        doutr0 = doutr0 + w_out;
+      }
+#endif
+    }
+  }
+}
+
+void conv_depthwise_3x3s2p1_bias_no_relu(float* dout,  // 3x3 depthwise kernel, two input rows per output row; the i == 0 branch below adds a zero row above the input
+                                         const float* din,
+                                         const float* weights,  // 9 floats per channel (weights + i * 9)
+                                         const float* bias,  // read as bias[i], and only when flag_bias is true
+                                         bool flag_bias,
+                                         bool flag_relu,  // not referenced in the lines of this function shown here
+                                         const int num,
+                                         const int ch_in,
+                                         const int h_in,
+                                         const int w_in,
+                                         const int h_out,
+                                         const int w_out,
+                                         ARMContext* ctx) {  // supplies the workspace used for the zero row and the scratch row
+  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};  // input column index per mask lane: even columns first, then odd
+  int out_pad_idx[4] = {0, 1, 2, 3};  // output lane index per wmask lane
+  int size_pad_bottom = h_out * 2 - h_in;  // not referenced in the lines of this function shown here
+
+  int cnt_col = (w_out >> 2) - 2;  // 4-output tiles minus two - presumably the left and right edge tiles; TODO confirm against the asm macros
+  int size_right_remain = w_in - (7 + cnt_col * 8);  // input columns left after 7 plus 8 per counted tile
+  if (size_right_remain >= 9) {  // enough left over to count one more tile of 8 input columns
+    cnt_col++;
+    size_right_remain -= 8;
+  }
+  int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4);  // number of lanes enabled in wmask below
+
+  int size_right_pad = w_out * 2 - w_in;  // not referenced in the lines of this function shown here
+
+  uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),  // a lane is set when size_right_remain is greater than its column index
+                                   vld1q_s32(right_pad_idx));  // 0 2 4 6
+  uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain),
+                                   vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
+  uint32x4_t wmask =
+      vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));  // 0 1 2 3
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+
+  float* zero_ptr = ctx->workspace_data<float>();  // NOTE(review): fetched once, outside the parallel channel loop, so all threads share it
+  memset(zero_ptr, 0, w_in * sizeof(float));  // first w_in floats become the all-zero padding row
+  float* write_ptr = zero_ptr + w_in;  // scratch row right behind the zero row; assumes the workspace is large enough - TODO confirm
+
+  unsigned int dmask[12];  // the three masks stored back to back
+
+  vst1q_u32(dmask, vmask_rp1);
+  vst1q_u32(dmask + 4, vmask_rp2);
+  vst1q_u32(dmask + 8, wmask);
+
+  for (int n = 0; n < num; ++n) {
+    const float* din_batch = din + n * ch_in * size_in_channel;
+    float* dout_batch = dout + n * ch_in * size_out_channel;  // output is indexed with ch_in channels as well
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      const float* din_channel = din_batch + i * size_in_channel;
+      float* dout_channel = dout_batch + i * size_out_channel;
+
+      const float* weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);  // each load takes 4 floats starting at a kernel row
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);  // NOTE(review): loads elements 6..9, one past the 9 weights of this channel - confirm the buffer has slack
+
+      float32x4_t vzero = vdupq_n_f32(0.f);
+#ifdef __aarch64__
+      float32x4_t wbias;  // bias broadcast to all lanes, or zeros when flag_bias is false
+      if (flag_bias) {
+        wbias = vdupq_n_f32(bias[i]);
+      } else {
+        wbias = vdupq_n_f32(0.f);
+      }
+#else
+      float bias_c = 0.f;  // scalar bias on the ARMv7 path
+      if (flag_bias) {
+        bias_c = bias[i];
+      }
+#endif  // __aarch64__
+
+      const float* dr0 = din_channel;  // five consecutive input rows
+      const float* dr1 = dr0 + w_in;
+      const float* dr2 = dr1 + w_in;
+      const float* dr3 = dr2 + w_in;
+      const float* dr4 = dr3 + w_in;
+
+      const float* din0_ptr = dr0;
+      const float* din1_ptr = dr1;
+      const float* din2_ptr = dr2;
+      const float* din3_ptr = dr3;
+      const float* din4_ptr = dr4;
+
+      float* doutr0 = dout_channel;
+      float* doutr0_ptr = nullptr;
+      float* doutr1_ptr = nullptr;
+
+#ifdef __aarch64__
+      for (int i = 0; i < h_in; i += 4) {  // four input rows, two output rows per pass; this i shadows the channel index i
+        din0_ptr = dr0;
+        din1_ptr = dr1;
+        din2_ptr = dr2;
+        din3_ptr = dr3;
+        din4_ptr = dr4;
+
+        doutr0_ptr = doutr0;
+        doutr1_ptr = doutr0 + w_out;  // second output row of this pass
+
+        if (i == 0) {  // first pass: the zero row stands in for the row above the input
+          din0_ptr = zero_ptr;
+          din1_ptr = dr0;
+          din2_ptr = dr1;
+          din3_ptr = dr2;
+          din4_ptr = dr3;
+          dr0 = dr3;  // the next pass starts at input row 3
+          dr1 = dr4;
         } else {
+          dr0 = dr4;  // later passes advance by four input rows
+          dr1 = dr0 + w_in;
+        }
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+
+        //! process bottom pad
+        if (i + 4 > h_in) {
+          switch (i + 4 - h_in) {  // no breaks: each case falls through and zeroes every later row too
+            case 4:
+              din1_ptr = zero_ptr;
+            case 3:
+              din2_ptr = zero_ptr;
+            case 2:
+              din3_ptr = zero_ptr;
+            case 1:
+              din4_ptr = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! process output pad
+        if (i / 2 + 2 > h_out) {  // no second output row in this pass: redirect it to the scratch row
+          doutr1_ptr = write_ptr;  // NOTE(review): the scratch row is shared by all threads of the parallel loop; presumably harmless because the data is discarded - confirm
+        }
+        int cnt = cnt_col;
        asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
                         MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
                     : [inptr0] "+r"(din0_ptr),
@@ -914,7 +1123,6 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout,
                       "v19",
                       "v20",
                       "v21");
-        }
        doutr0 = doutr0 + 2 * w_out;
      }
 #else
@@ -951,37 +1159,6 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout,
        }
        int cnt = cnt_col;
        unsigned int* mask_ptr = dmask;
-        if (flag_relu) {
-          asm volatile(
-              INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2
-                  MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU
-              : [din0_ptr] "+r"(din0_ptr),
-                [din1_ptr] "+r"(din1_ptr),
-                [din2_ptr] "+r"(din2_ptr),
-                [outptr] "+r"(doutr0_ptr),
-                [cnt] "+r"(cnt),
-                [mask_ptr] "+r"(mask_ptr)
-              : [remain] "r"(cnt_remain),
-                [wr0] "w"(wr0),
-                [wr1] "w"(wr1),
-                [wr2] "w"(wr2),
-                [bias] "r"(bias_c)
-              : "cc",
-                "memory",
-                "q3",
-                "q4",
-                "q5",
-                "q6",
-                "q7",
-                "q8",
-                "q9",
-                "q10",
-                "q11",
-                "q12",
-                "q13",
-                "q14",
-                "q15");
-        } else {
        asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
                         MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
                     : [din0_ptr] "+r"(din0_ptr),
@@ -1010,7 +1187,6 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout,
                       "q13",
                       "q14",
                       "q15");
-        }
        doutr0 = doutr0 + w_out;
      }
 #endif
@@ -1088,7 +1264,6 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,

        unsigned int* mask_ptr = dmask;
 #ifdef __aarch64__
-        if (flag_relu) {
        asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
@@ -1111,8 +1286,8 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
                       "v13",
                       "v14",
                       "v15");
-        } else {
-          asm volatile(COMPUTE_S_S2 RESULT_S_S2
+#else
+        asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
                       [din2_ptr] "+r"(din2_ptr),
@@ -1120,24 +1295,124 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
                     : [wr0] "w"(wr0),
                       [wr1] "w"(wr1),
                       [wr2] "w"(wr2),
-                         [bias] "w"(vbias),
+                       [bias] "r"(bias_c),
                       [out] "r"(out_buf)
-                       : "v4",
-                         "v5",
-                         "v6",
-                         "v7",
-                         "v8",
-                         "v9",
-                         "v10",
-                         "v11",
-                         "v12",
-                         "v13",
-                         "v14",
-                         "v15");
+                     : "cc",
+                       "memory",
+                       "q3",
+                       "q4",
+                       "q5",
+                       "q6",
+                       "q7",
+                       "q8",
+                       "q9",
+                       "q10",
+                       "q11",
+                       "q12",
+                       "q13",
+                       "q14",
+                       "q15");
+#endif
+        for (int w = 0; w < w_out; ++w) {
+          *dout_channel++ = out_buf[w];
+        }
+        hs += 2;
+        he += 2;
+      }
+    }
  }
+}
+void conv_depthwise_3x3s2p1_bias_s_no_relu(float* dout,  // filled from out_buf, w_out floats per output row, after each asm call
+                                           const float* din,
+                                           const float* weights,  // 9 floats per channel (weights + i * 9)
+                                           const float* bias,  // read as bias[i], and only when flag_bias is true
+                                           bool flag_bias,
+                                           bool flag_relu,  // not referenced in the lines of this function shown here
+                                           const int num,
+                                           const int ch_in,
+                                           const int h_in,
+                                           const int w_in,
+                                           const int h_out,
+                                           const int w_out,
+                                           ARMContext* ctx) {  // ctx is not referenced in the lines of this function shown here
+  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};  // input column index per mask lane: even columns first, then odd
+  int out_pad_idx[4] = {0, 1, 2, 3};  // not referenced in the lines of this function shown here
+  float zeros[8] = {0.0f};  // all-zero row substituted for window rows above or below the input
+
+  uint32x4_t vmask_rp1 =  // a lane is set when w_in is greater than its column index
+      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx));  // 0 2 4 6
+  uint32x4_t vmask_rp2 =
+      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+
+  unsigned int dmask[8];  // both masks stored back to back; handed to the asm as mask_ptr
+  vst1q_u32(dmask, vmask_rp1);
+  vst1q_u32(dmask + 4, vmask_rp2);
+
+  for (int n = 0; n < num; ++n) {
+    const float* din_batch = din + n * ch_in * size_in_channel;
+    float* dout_batch = dout + n * ch_in * size_out_channel;  // output is indexed with ch_in channels as well
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      const float* din_channel = din_batch + i * size_in_channel;
+      float* dout_channel = dout_batch + i * size_out_channel;
+
+      const float* weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);  // each load takes 4 floats starting at a kernel row
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);  // NOTE(review): loads elements 6..9, one past the 9 weights of this channel - confirm the buffer has slack
+
+      float bias_c = 0.f;
+
+      if (flag_bias) {
+        bias_c = bias[i];
+      }
+      float32x4_t vbias = vdupq_n_f32(bias_c);
+      int hs = -1;  // first input row of the current 3-row window; -1 is the padding row above the input
+      int he = 2;  // one past the last input row of the current window
+      float out_buf[4];  // staging row passed to the asm; the copy loop after it reads out_buf[0..w_out), so w_out must be <= 4
+      for (int j = 0; j < h_out; ++j) {  // one output row per iteration
+        const float* dr0 = din_channel + hs * w_in;  // one row before the channel start when hs == -1; replaced by zeros below before any use
+        const float* dr1 = dr0 + w_in;
+        const float* dr2 = dr1 + w_in;
+        if (hs == -1) {
+          dr0 = zeros;
+        }
+        if (he > h_in) {  // the last row of the window lies below the input
+          dr2 = zeros;
+        }
+        const float* din0_ptr = dr0;
+        const float* din1_ptr = dr1;
+        const float* din2_ptr = dr2;
+
+        unsigned int* mask_ptr = dmask;
+#ifdef __aarch64__
+        asm volatile(COMPUTE_S_S2 RESULT_S_S2  // macro bodies are not part of this hunk
+                     : [din0_ptr] "+r"(din0_ptr),
+                       [din1_ptr] "+r"(din1_ptr),
+                       [din2_ptr] "+r"(din2_ptr),
+                       [mask_ptr] "+r"(mask_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [bias] "w"(vbias),
+                       [out] "r"(out_buf)  // address passed as a plain input operand
+                     : "v4",  // NOTE(review): no cc or memory clobber in this list, unlike the other asm statements in this file; if the macros store through out_buf the compiler is not told - confirm and add them
+                       "v5",
+                       "v6",
+                       "v7",
+                       "v8",
+                       "v9",
+                       "v10",
+                       "v11",
+                       "v12",
+                       "v13",
+                       "v14",
+                       "v15");
 #else
-        if (flag_relu) {
-          asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU
+        asm volatile(COMPUTE_S_S2 RESULT_S_S2
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
                       [din2_ptr] "+r"(din2_ptr),
@@ -1162,17 +1437,245 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
                       "q13",
                       "q14",
                       "q15");
+#endif
+        for (int w = 0; w < w_out; ++w) {
+          *dout_channel++ = out_buf[w];
+        }
+        hs += 2;
+        he += 2;
+      }
+    }
+  }
+}
+
+/**
+ * \brief depthwise convolution kernel 3x3, stride 2; no zero row is added above the input, and the asm uses the RELU result macros
+ */
+// w_in > 7 (stated by the original author; not checked in the lines shown here)
+void conv_depthwise_3x3s2p0_bias_relu(float* dout,
+                                      const float* din,
+                                      const float* weights,  // 9 floats per channel (weights + i * 9)
+                                      const float* bias,  // read as bias[i], and only when flag_bias is true
+                                      bool flag_bias,
+                                      bool flag_relu,  // not referenced in the lines of this function shown here
+                                      const int num,
+                                      const int ch_in,
+                                      const int h_in,
+                                      const int w_in,
+                                      const int h_out,
+                                      const int w_out,
+                                      ARMContext* ctx) {  // supplies the workspace used for the zero row and the scratch row
+  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};  // input column index per mask lane: even columns first, then odd
+  int out_pad_idx[4] = {0, 1, 2, 3};  // output lane index per wmask lane
+
+  int tile_w = w_out >> 2;  // full tiles of 4 outputs
+  int cnt_remain = w_out % 4;  // outputs left over after the full tiles
+
+  unsigned int size_right_remain = (unsigned int)(8 + (tile_w << 3) - w_in);
+  size_right_remain = 8 - size_right_remain;  // net effect of the two lines: w_in - 8 * tile_w, computed in unsigned arithmetic
+
+  if (cnt_remain == 0 && size_right_remain == 0) {  // nothing left over: hand the last full tile to the right-edge code instead
+    cnt_remain = 4;
+    tile_w -= 1;
+    size_right_remain = 8;
+  }
+  uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),  // a lane is set when size_right_remain is greater than its column index
+                                   vld1q_s32(right_pad_idx));  // 0 2 4 6
+  uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain),
+                                   vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
+  uint32x4_t wmask =
+      vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));  // 0 1 2 3
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+
+  float* zero_ptr = ctx->workspace_data<float>();  // NOTE(review): fetched once, outside the parallel channel loop, so all threads share it
+  memset(zero_ptr, 0, w_in * sizeof(float));  // first w_in floats become the all-zero padding row
+  float* write_ptr = zero_ptr + w_in;  // scratch row right behind the zero row; assumes the workspace is large enough - TODO confirm
+
+  unsigned int dmask[12];  // the three masks stored back to back; the ARMv7 loop points mask_ptr at it
+
+  vst1q_u32(dmask, vmask_rp1);
+  vst1q_u32(dmask + 4, vmask_rp2);
+  vst1q_u32(dmask + 8, wmask);
+
+  for (int n = 0; n < num; ++n) {
+    const float* din_batch = din + n * ch_in * size_in_channel;
+    float* dout_batch = dout + n * ch_in * size_out_channel;  // output is indexed with ch_in channels as well
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      const float* din_channel = din_batch + i * size_in_channel;
+      float* dout_channel = dout_batch + i * size_out_channel;
+
+      const float* weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);  // each load takes 4 floats starting at a kernel row
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);  // NOTE(review): loads elements 6..9, one past the 9 weights of this channel - confirm the buffer has slack
+
+      float32x4_t vzero = vdupq_n_f32(0.f);
+
+#ifdef __aarch64__
+      float32x4_t wbias;  // bias broadcast to all lanes, or zeros when flag_bias is false
+      if (flag_bias) {
+        wbias = vdupq_n_f32(bias[i]);
       } else {
-          asm volatile(COMPUTE_S_S2 RESULT_S_S2
+        wbias = vdupq_n_f32(0.f);
+      }
+#else
+      float bias_c = 0.f;  // scalar bias on the ARMv7 path
+      if (flag_bias) {
+        bias_c = bias[i];
+      }
+#endif  // __aarch64__
+
+      const float* dr0 = din_channel;  // five consecutive input rows
+      const float* dr1 = dr0 + w_in;
+      const float* dr2 = dr1 + w_in;
+      const float* dr3 = dr2 + w_in;
+      const float* dr4 = dr3 + w_in;
+
+      const float* din0_ptr = dr0;
+      const float* din1_ptr = dr1;
+      const float* din2_ptr = dr2;
+      const float* din3_ptr = dr3;
+      const float* din4_ptr = dr4;
+
+      float* doutr0 = dout_channel;
+      float* doutr0_ptr = nullptr;
+      float* doutr1_ptr = nullptr;
+
+#ifdef __aarch64__
+      for (int i = 0; i < h_out; i += 2) {  // two output rows per pass; this i shadows the channel index i
+        din0_ptr = dr0;
+        din1_ptr = dr1;
+        din2_ptr = dr2;
+        din3_ptr = dr3;
+        din4_ptr = dr4;
+
+        doutr0_ptr = doutr0;
+        doutr1_ptr = doutr0 + w_out;  // second output row of this pass
+
+        dr0 = dr4;  // the next pass starts four input rows further down
+        dr1 = dr0 + w_in;
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+
+        //! process bottom pad
+        if (i * 2 + 5 > h_in) {  // the five-row window i*2 .. i*2+4 runs past the input
+          switch (i * 2 + 5 - h_in) {  // no breaks: each case falls through and zeroes every later row too
+            case 4:
+              din1_ptr = zero_ptr;
+            case 3:
+              din2_ptr = zero_ptr;
+            case 2:
+              din3_ptr = zero_ptr;
+            case 1:
+              din4_ptr = zero_ptr;
+            case 0:  // NOTE(review): unreachable, the enclosing if guarantees a value of at least 1; it also repeats the case 1 assignment
+              din4_ptr = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! process output pad
+        if (i + 2 > h_out) {  // no second output row in this pass: redirect it to the scratch row
+          doutr1_ptr = write_ptr;  // NOTE(review): the scratch row is shared by all threads of the parallel loop; presumably harmless because the data is discarded - confirm
+        }
+        int cnt = tile_w;
+        asm volatile(
+            INIT_S2
+            "ld1 {v15.4s}, [%[inptr0]]                 \n"
+            "ld1 {v18.4s}, [%[inptr1]]                 \n"
+            "ld1 {v19.4s}, [%[inptr2]]                 \n"
+            "ld1 {v20.4s}, [%[inptr3]]                 \n"
+            "ld1 {v21.4s}, [%[inptr4]]                 \n"
+            "ext  v10.16b, v0.16b, v15.16b, #4     \n"  // v10 = {2,4,6,8}
+            MID_COMPUTE_S2 MID_RESULT_S2_RELU
+            "cmp %w[remain], #1                           \n"
+            "blt 4f                                     \n" RIGHT_COMPUTE_S2
+                RIGHT_RESULT_S2_RELU
+            "4:                                          \n"
+            : [inptr0] "+r"(din0_ptr),
+              [inptr1] "+r"(din1_ptr),
+              [inptr2] "+r"(din2_ptr),
+              [inptr3] "+r"(din3_ptr),
+              [inptr4] "+r"(din4_ptr),
+              [outptr0] "+r"(doutr0_ptr),
+              [outptr1] "+r"(doutr1_ptr),
+              [cnt] "+r"(cnt)
+            : [vzero] "w"(vzero),
+              [w0] "w"(wr0),
+              [w1] "w"(wr1),
+              [w2] "w"(wr2),
+              [remain] "r"(cnt_remain),
+              [mask1] "w"(vmask_rp1),
+              [mask2] "w"(vmask_rp2),
+              [wmask] "w"(wmask),
+              [vbias] "w"(wbias)
+            : "cc",
+              "memory",
+              "v0",
+              "v1",
+              "v2",
+              "v3",
+              "v4",
+              "v5",
+              "v6",
+              "v7",
+              "v8",
+              "v9",
+              "v10",
+              "v11",
+              "v12",
+              "v13",
+              "v14",
+              "v15",
+              "v16",
+              "v17",
+              "v18",
+              "v19",
+              "v20",
+              "v21");
+        doutr0 = doutr0 + 2 * w_out;
+      }
+#else
+      for (int i = 0; i < h_out; i++) {
+        din0_ptr = dr0;
+        din1_ptr = dr1;
+        din2_ptr = dr2;
+
+        doutr0_ptr = doutr0;
+
+        dr0 = dr2;
+        dr1 = dr0 + w_in;
+        dr2 = dr1 + w_in;
+
+        //! process bottom pad
+        if (i * 2 + 3 > h_in) {
+          switch (i * 2 + 3 - h_in) {
+            case 2:
+              din1_ptr = zero_ptr;
+            case 1:
+              din2_ptr = zero_ptr;
+            default:
+              break;
+          }
+        }
+        int cnt = tile_w;
+        unsigned int* mask_ptr = dmask;
+        asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU RIGHT_COMPUTE_S2
+                         RIGHT_RESULT_S2_RELU
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
                       [din2_ptr] "+r"(din2_ptr),
+                       [outptr] "+r"(doutr0_ptr),
+                       [cnt] "+r"(cnt),
                       [mask_ptr] "+r"(mask_ptr)
-                       : [wr0] "w"(wr0),
+                     : [remain] "r"(cnt_remain),
+                       [wr0] "w"(wr0),
                       [wr1] "w"(wr1),
                       [wr2] "w"(wr2),
-                         [bias] "r"(bias_c),
-                         [out] "r"(out_buf)
+                       [bias] "r"(bias_c)
                     : "cc",
                       "memory",
                       "q3",
@@ -1188,23 +1691,13 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
                       "q13",
                       "q14",
                       "q15");
+        doutr0 = doutr0 + w_out;
      }
 #endif
-        for (int w = 0; w < w_out; ++w) {
-          *dout_channel++ = out_buf[w];
-        }
-        hs += 2;
-        he += 2;
-      }
    }
  }
 }
-
-/**
- * \brief depthwise convolution kernel 3x3, stride 2
- */
-// w_in > 7
-void conv_depthwise_3x3s2p0_bias_relu(float* dout,
+void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
                                         const float* din,
                                         const float* weights,
                                         const float* bias,
@@ -1334,62 +1827,6 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout,
          doutr1_ptr = write_ptr;
        }
        int cnt = tile_w;
-        if (flag_relu) {
-          asm volatile(
-              INIT_S2
-              "ld1 {v15.4s}, [%[inptr0]]                 \n"
-              "ld1 {v18.4s}, [%[inptr1]]                 \n"
-              "ld1 {v19.4s}, [%[inptr2]]                 \n"
-              "ld1 {v20.4s}, [%[inptr3]]                 \n"
-              "ld1 {v21.4s}, [%[inptr4]]                 \n"
-              "ext  v10.16b, v0.16b, v15.16b, #4     \n"  // v10 = {2,4,6,8}
-              MID_COMPUTE_S2 MID_RESULT_S2_RELU
-              "cmp %w[remain], #1                           \n"
-              "blt 4f                                     \n" RIGHT_COMPUTE_S2
-                  RIGHT_RESULT_S2_RELU
-              "4:                                          \n"
-              : [inptr0] "+r"(din0_ptr),
-                [inptr1] "+r"(din1_ptr),
-                [inptr2] "+r"(din2_ptr),
-                [inptr3] "+r"(din3_ptr),
-                [inptr4] "+r"(din4_ptr),
-                [outptr0] "+r"(doutr0_ptr),
-                [outptr1] "+r"(doutr1_ptr),
-                [cnt] "+r"(cnt)
-              : [vzero] "w"(vzero),
-                [w0] "w"(wr0),
-                [w1] "w"(wr1),
-                [w2] "w"(wr2),
-                [remain] "r"(cnt_remain),
-                [mask1] "w"(vmask_rp1),
-                [mask2] "w"(vmask_rp2),
-                [wmask] "w"(wmask),
-                [vbias] "w"(wbias)
-              : "cc",
-                "memory",
-                "v0",
-                "v1",
-                "v2",
-                "v3",
-                "v4",
-                "v5",
-                "v6",
-                "v7",
-                "v8",
-                "v9",
-                "v10",
-                "v11",
-                "v12",
-                "v13",
-                "v14",
-                "v15",
-                "v16",
-                "v17",
-                "v18",
-                "v19",
-                "v20",
-                "v21");
-        } else {
        asm volatile(
            INIT_S2
            "ld1 {v15.4s}, [%[inptr0]]                 \n"
@@ -1401,8 +1838,7 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout,
            MID_COMPUTE_S2 MID_RESULT_S2
            "cmp %w[remain], #1                           \n"
            "blt 4f                                     \n" RIGHT_COMPUTE_S2
-                  RIGHT_RESULT_S2
-              "4:                                          \n"
+                RIGHT_RESULT_S2 "4:                                          \n"
            : [inptr0] "+r"(din0_ptr),
              [inptr1] "+r"(din1_ptr),
              [inptr2] "+r"(din2_ptr),
@@ -1444,7 +1880,6 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout,
              "v19",
              "v20",
              "v21");
-        }
        doutr0 = doutr0 + 2 * w_out;
      }
 #else
@@ -1472,36 +1907,6 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout,
        }
        int cnt = tile_w;
        unsigned int* mask_ptr = dmask;
-        if (flag_relu) {
-          asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU
-                           RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU
-                       : [din0_ptr] "+r"(din0_ptr),
-                         [din1_ptr] "+r"(din1_ptr),
-                         [din2_ptr] "+r"(din2_ptr),
-                         [outptr] "+r"(doutr0_ptr),
-                         [cnt] "+r"(cnt),
-                         [mask_ptr] "+r"(mask_ptr)
-                       : [remain] "r"(cnt_remain),
-                         [wr0] "w"(wr0),
-                         [wr1] "w"(wr1),
-                         [wr2] "w"(wr2),
-                         [bias] "r"(bias_c)
-                       : "cc",
-                         "memory",
-                         "q3",
-                         "q4",
-                         "q5",
-                         "q6",
-                         "q7",
-                         "q8",
-                         "q9",
-                         "q10",
-                         "q11",
-                         "q12",
-                         "q13",
-                         "q14",
-                         "q15");
-        } else {
        asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2
                         RIGHT_RESULT_S2
                     : [din0_ptr] "+r"(din0_ptr),
@@ -1530,14 +1935,12 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout,
                       "q13",
                       "q14",
                       "q15");
-        }
        doutr0 = doutr0 + w_out;
      }
 #endif
    }
  }
 }
-
 /**
 * \brief depthwise convolution kernel 3x3, stride 2, width <= 4
 */
@@ -1614,7 +2017,6 @@ void conv_depthwise_3x3s2p0_bias_s_relu(float* dout,

        unsigned int* mask_ptr = dmask;
 #ifdef __aarch64__
-        if (flag_relu) {
        asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
@@ -1640,35 +2042,7 @@ void conv_depthwise_3x3s2p0_bias_s_relu(float* dout,
                       "v14",
                       "v15",
                       "v16");
-        } else {
-          asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
-                       : [din0_ptr] "+r"(din0_ptr),
-                         [din1_ptr] "+r"(din1_ptr),
-                         [din2_ptr] "+r"(din2_ptr),
-                         [mask_ptr] "+r"(mask_ptr)
-                       : [wr0] "w"(wr0),
-                         [wr1] "w"(wr1),
-                         [wr2] "w"(wr2),
-                         [bias] "w"(vbias),
-                         [out] "r"(out_buf)
-                       : "cc",
-                         "memory",
-                         "v4",
-                         "v5",
-                         "v6",
-                         "v7",
-                         "v8",
-                         "v9",
-                         "v10",
-                         "v11",
-                         "v12",
-                         "v13",
-                         "v14",
-                         "v15",
-                         "v16");
-        }
 #else
-        if (flag_relu) {
        asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
@@ -1694,7 +2068,113 @@ void conv_depthwise_3x3s2p0_bias_s_relu(float* dout,
                       "q13",
                       "q14",
                       "q15");
-        } else {
+#endif
+        for (int w = 0; w < w_out; ++w) {
+          *dout_channel++ = out_buf[w];
+        }
+      }
+    }
+  }
+}
+void conv_depthwise_3x3s2p0_bias_s_no_relu(float* dout,
+                                           const float* din,
+                                           const float* weights,
+                                           const float* bias,
+                                           bool flag_bias,
+                                           bool flag_relu,
+                                           const int num,
+                                           const int ch_in,
+                                           const int h_in,
+                                           const int w_in,
+                                           const int h_out,
+                                           const int w_out,
+                                           ARMContext* ctx) {
+  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+  int out_pad_idx[4] = {0, 1, 2, 3};
+  float zeros[8] = {0.0f};
+  const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f};
+
+  uint32x4_t vmask_rp1 =
+      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx));  // 0 2 4 6
+  uint32x4_t vmask_rp2 =
+      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+
+  unsigned int dmask[8];
+  vst1q_u32(dmask, vmask_rp1);
+  vst1q_u32(dmask + 4, vmask_rp2);
+
+  for (int n = 0; n < num; ++n) {
+    const float* din_batch = din + n * ch_in * size_in_channel;
+    float* dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int i = 0; i < ch_in; ++i) {
+      const float* din_channel = din_batch + i * size_in_channel;
+      float* dout_channel = dout_batch + i * size_out_channel;
+
+      const float* weight_ptr = weights + i * 9;
+      float32x4_t wr0 = vld1q_f32(weight_ptr);
+      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+
+      float bias_c = 0.f;
+
+      if (flag_bias) {
+        bias_c = bias[i];
+      }
+      float32x4_t vbias = vdupq_n_f32(bias_c);
+      float out_buf[4];
+      const float* dr0 = din_channel;
+      const float* dr1 = dr0 + w_in;
+      const float* dr2 = dr1 + w_in;
+      for (int j = 0; j < h_out; j++) {
+        const float* din0_ptr = dr0;
+        const float* din1_ptr = dr1;
+        const float* din2_ptr = dr2;
+        if (j * 2 + 2 >= h_in) {
+          switch (j + 2 - h_in) {
+            case 1:
+              din1_ptr = zero_ptr;
+            case 0:
+              din2_ptr = zero_ptr;
+            default:
+              break;
+          }
+        }
+        dr0 = dr2;
+        dr1 = dr0 + w_in;
+        dr2 = dr1 + w_in;
+
+        unsigned int* mask_ptr = dmask;
+#ifdef __aarch64__
+        asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
+                     : [din0_ptr] "+r"(din0_ptr),
+                       [din1_ptr] "+r"(din1_ptr),
+                       [din2_ptr] "+r"(din2_ptr),
+                       [mask_ptr] "+r"(mask_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [bias] "w"(vbias),
+                       [out] "r"(out_buf)
+                     : "cc",
+                       "memory",
+                       "v4",
+                       "v5",
+                       "v6",
+                       "v7",
+                       "v8",
+                       "v9",
+                       "v10",
+                       "v11",
+                       "v12",
+                       "v13",
+                       "v14",
+                       "v15",
+                       "v16");
+#else
        asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
@@ -1720,7 +2200,6 @@ void conv_depthwise_3x3s2p0_bias_s_relu(float* dout,
                       "q13",
                       "q14",
                       "q15");
-        }
 #endif
        for (int w = 0; w < w_out; ++w) {
          *dout_channel++ = out_buf[w];

--- a/lite/backends/arm/math/conv_depthwise.h
+++ b/lite/backends/arm/math/conv_depthwise.h
@@ -323,6 +323,118 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
                                        const int w_out,
                                        ARMContext* ctx);

+void conv_depthwise_3x3s1p0_bias_no_relu(float* dout,
+                                         const float* din,
+                                         const float* weights,
+                                         const float* bias,
+                                         bool flag_bias,
+                                         bool flag_relu,  // unused? confirm
+                                         const int num,
+                                         const int ch_in,
+                                         const int h_in,
+                                         const int w_in,
+                                         const int h_out,
+                                         const int w_out,
+                                         ARMContext* ctx);
+
+void conv_depthwise_3x3s1p0_bias_s_no_relu(float* dout,
+                                           const float* din,
+                                           const float* weights,
+                                           const float* bias,
+                                           bool flag_bias,
+                                           bool flag_relu,  // unused? confirm
+                                           const int num,
+                                           const int ch_in,
+                                           const int h_in,
+                                           const int w_in,
+                                           const int h_out,
+                                           const int w_out,
+                                           ARMContext* ctx);
+
+void conv_depthwise_3x3s1p1_bias_no_relu(float* dout,
+                                         const float* din,
+                                         const float* weights,
+                                         const float* bias,
+                                         bool flag_bias,
+                                         bool flag_relu,  // unused? confirm
+                                         const int num,
+                                         const int ch_in,
+                                         const int h_in,
+                                         const int w_in,
+                                         const int h_out,
+                                         const int w_out,
+                                         ARMContext* ctx);
+
+void conv_depthwise_3x3s1p1_bias_s_no_relu(float* dout,
+                                           const float* din,
+                                           const float* weights,
+                                           const float* bias,
+                                           bool flag_bias,
+                                           bool flag_relu,  // unused? confirm
+                                           const int num,
+                                           const int ch_in,
+                                           const int h_in,
+                                           const int w_in,
+                                           const int h_out,
+                                           const int w_out,
+                                           ARMContext* ctx);
+
+void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
+                                         const float* din,
+                                         const float* weights,
+                                         const float* bias,
+                                         bool flag_bias,
+                                         bool flag_relu,  // unused? confirm
+                                         const int num,
+                                         const int ch_in,
+                                         const int h_in,
+                                         const int w_in,
+                                         const int h_out,
+                                         const int w_out,
+                                         ARMContext* ctx);
+
+void conv_depthwise_3x3s2p0_bias_s_no_relu(float* dout,  // NCHW output floats
+                                           const float* din,  // NCHW input
+                                           const float* weights,  // 9/channel
+                                           const float* bias,  // per-channel
+                                           bool flag_bias,  // false: bias = 0
+                                           bool flag_relu,  // unused? confirm
+                                           const int num,  // batch count
+                                           const int ch_in,  // channel count
+                                           const int h_in,  // input rows
+                                           const int w_in,  // input row width
+                                           const int h_out,  // output rows
+                                           const int w_out,  // must be <= 4
+                                           ARMContext* ctx);  // unused? confirm
+
+void conv_depthwise_3x3s2p1_bias_no_relu(float* dout,
+                                         const float* din,
+                                         const float* weights,
+                                         const float* bias,
+                                         bool flag_bias,
+                                         bool flag_relu,  // unused? confirm
+                                         const int num,
+                                         const int ch_in,
+                                         const int h_in,
+                                         const int w_in,
+                                         const int h_out,
+                                         const int w_out,
+                                         ARMContext* ctx);
+
+void conv_depthwise_3x3s2p1_bias_s_no_relu(float* dout,
+                                           const float* din,
+                                           const float* weights,
+                                           const float* bias,
+                                           bool flag_bias,
+                                           bool flag_relu,  // unused? confirm
+                                           const int num,
+                                           const int ch_in,
+                                           const int h_in,
+                                           const int w_in,
+                                           const int h_out,
+                                           const int w_out,
+                                           ARMContext* ctx);
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite