[lite][arm]add conv relu6 and leaky_relu in conv_dw_3x3s2, test=develop (#2618)

* fix conv 2-pad to 4-pad * fix compute conv shape * fix pad, test=develop * change conv_depthwise_3x3s1_fp.cc name to conv3x3s1p01_depthwise_fp32.cc to distinguish between conv3x3s1_depthwise_fp32.cc * delete printf note in conv3x3s1, test=develop * delete printf note, test=develop * delete gem_sdot.h, test=develop it is coped from __gemm_sdot_meta_.h * update compute padding, test=develop * fix padding size, must be 2 or 4. test=develop * fix format in operators/conv_op.cc, test=develop * change #if 0 to #if 1, test=develop * put 2-pad to 4-pad in AttachImpl, test=develop * fix clang-format error inn tests/math/connv_compute_test, test=develop * fix x86 test result error, test=develop * add asymmetric padding test case in liite/tests/math/conv_compute.cc, test=develop * change paddings type to support dynamically modify, test=develop * fix x86 build error in connv_compute_test, test=develop * fix opencl build error, test=develop * fix oopencl build error, test=develop * fix opencl/conv_compute build error, test=develop * fix opencl/conv_compute build error, test=develop * fix format in kernels/opencl/conv_computte_ttest,test=develop * fix build error, test=develop fix build error in kernels/x86/conv_compute.h * fix ccompute shape error in ooperators/conv_op.h, test=develop * add conv_reelu6 and conv leaky_relu in conv_3x3s1_direct * add conv_relu6 in c1, c2, c4,test=develop * fix conflict in conv_bloock_utils.h, test=develop * add relu6 and leankyrelu in conv_3x3s1_dw * add conv_3x3s1px_dw relu6 and leaky_relu fusion, test=develop * fix conflict in tests/math/conv_compute_arm, test=develop * fix build error in winograd arm, test=develop * channge act_param as pointer in conv_block_tuils.h, test=develop * fix winograd in no equal 4-padding compute error, test=develop * add conv relu6 and leaky_relu in conv_dw_3x3s2, test=develop * fix format, test=develop * fix format in conv_block_utils, test=develop * move updatePadding from conv_op.cc to conv_op.h, test=develop * fix format conv_op.h, test=develop * fix buuilde error in conv_oop.h, test=develop * remove flag_relu parameter in conv_3x3_depthwise, test=develop

[lite][arm]add conv relu6 and leaky_relu in conv_dw_3x3s2, test=develop (#2618)
* fix conv 2-pad to 4-pad * fix compute conv shape * fix pad, test=develop * change conv_depthwise_3x3s1_fp.cc name to conv3x3s1p01_depthwise_fp32.cc to distinguish between conv3x3s1_depthwise_fp32.cc * delete printf note in conv3x3s1, test=develop * delete printf note, test=develop * delete gem_sdot.h, test=develop it is coped from __gemm_sdot_meta_.h * update compute padding, test=develop * fix padding size, must be 2 or 4. test=develop * fix format in operators/conv_op.cc, test=develop * change #if 0 to #if 1, test=develop * put 2-pad to 4-pad in AttachImpl, test=develop * fix clang-format error inn tests/math/connv_compute_test, test=develop * fix x86 test result error, test=develop * add asymmetric padding test case in liite/tests/math/conv_compute.cc, test=develop * change paddings type to support dynamically modify, test=develop * fix x86 build error in connv_compute_test, test=develop * fix opencl build error, test=develop * fix oopencl build error, test=develop * fix opencl/conv_compute build error, test=develop * fix opencl/conv_compute build error, test=develop * fix format in kernels/opencl/conv_computte_ttest,test=develop * fix build error, test=develop fix build error in kernels/x86/conv_compute.h * fix ccompute shape error in ooperators/conv_op.h, test=develop * add conv_reelu6 and conv leaky_relu in conv_3x3s1_direct * add conv_relu6 in c1, c2, c4,test=develop * fix conflict in conv_bloock_utils.h, test=develop * add relu6 and leankyrelu in conv_3x3s1_dw * add conv_3x3s1px_dw relu6 and leaky_relu fusion, test=develop * fix conflict in tests/math/conv_compute_arm, test=develop * fix build error in winograd arm, test=develop * channge act_param as pointer in conv_block_tuils.h, test=develop * fix winograd in no equal 4-padding compute error, test=develop * add conv relu6 and leaky_relu in conv_dw_3x3s2, test=develop * fix format, test=develop * fix format in conv_block_utils, test=develop * move updatePadding from conv_op.cc to conv_op.h, test=develop * fix format conv_op.h, test=develop * fix buuilde error in conv_oop.h, test=develop * remove flag_relu parameter in conv_3x3_depthwise, test=develop
d4739621 · HappyAngel · GitHub · 1b74fded · d4739621 · d4739621
9 changed file
--- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
--- a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc
@@ -836,7 +836,6 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
      threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
  ctx->ExtendWorkspace(sizeof(float) * workspace_size);

-  bool flag_relu = param.fuse_relu;
  bool flag_bias = param.bias != nullptr;

  /// get workspace

--- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc
--- a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc
--- a/lite/backends/arm/math/conv_block_utils.h
+++ b/lite/backends/arm/math/conv_block_utils.h
@@ -2151,6 +2151,210 @@ inline void act_switch_c8_fp32(const float* din_ptr,
  }
 }

+#ifdef __aarch64__
+#define LOAD_DATA                                               \
+  "1:                               \n"                         \
+  "ld1 {v0.4s}, [%[din_ptr]], #16   \n" /*vld1q_f32(din_ptr0)*/ \
+  "ld1 {v1.4s}, [%[din_ptr]], #16   \n" /*vld1q_f32(din_ptr0)*/ \
+  "ld1 {v2.4s}, [%[din_ptr]], #16   \n" /*vld1q_f32(din_ptr0)*/ \
+  "ld1 {v3.4s}, [%[din_ptr]], #16   \n" /*vld1q_f32(din_ptr0)*/
+#define DO_RELU                                           \
+  "fmax v0.4s, v0.4s, %[vzero].4s   \n" /* vmaxq_f32() */ \
+  "fmax v1.4s, v1.4s, %[vzero].4s   \n" /* vmaxq_f32() */ \
+  "fmax v2.4s, v2.4s, %[vzero].4s   \n" /* vmaxq_f32() */ \
+  "fmax v3.4s, v3.4s, %[vzero].4s   \n" /* vmaxq_f32() */
+#define DO_RELU6                                         \
+  "fmin v0.4s, v0.4s, %[vsix].4s   \n" /* vmaxq_f32() */ \
+  "fmin v1.4s, v1.4s, %[vsix].4s   \n" /* vmaxq_f32() */ \
+  "fmin v2.4s, v2.4s, %[vsix].4s   \n" /* vmaxq_f32() */ \
+  "fmin v3.4s, v3.4s, %[vsix].4s   \n" /* vmaxq_f32() */
+#define DO_LEAKY_RELU                                     \
+  "cmhs v4.4s, v0.4s,  %[vzero].4s  \n"   /* vcgeq_u32 */ \
+  "fmul v5.4s, v0.4s, %[vscale].4s \n"    /* vmulq_f32 */ \
+  "cmhs v6.4s, v1.4s,  %[vzero].4s  \n"   /* vcgeq_u32 */ \
+  "fmul v7.4s, v1.4s, %[vscale].4s \n"    /* vmulq_f32 */ \
+  "cmhs v8.4s, v2.4s,  %[vzero].4s  \n"   /* vcgeq_u32 */ \
+  "fmul v9.4s, v2.4s, %[vscale].4s \n"    /* vmulq_f32 */ \
+  "cmhs v10.4s, v3.4s,  %[vzero].4s  \n"  /* vcgeq_u32 */ \
+  "fmul v11.4s, v3.4s, %[vscale].4s \n"   /* vmulq_f32 */ \
+  "bif v0.16b, v5.16b, v4.16b       \n"   /* choose*/     \
+  "bif v1.16b, v7.16b, v6.16b       \n"   /* choose*/     \
+  "bif v2.16b, v9.16b, v8.16b       \n"   /* choose*/     \
+  "bif v3.16b, v11.16b, v10.16b       \n" /* choose*/
+#define DO_STORE                                         \
+  "subs %w[cnt], %w[cnt], #1                    \n"      \
+  "st1 {v0.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
+  "st1 {v1.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
+  "st1 {v2.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
+  "st1 {v3.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
+  "bne  1b                                    \n"
+#else
+#define LOAD_DATA                                            \
+  "1:                               \n"                      \
+  "vld1.32 {d6-d7}, [%[din_ptr]]!   @ vld1q_f32(din_ptr) \n" \
+  "vld1.32 {d8-d9}, [%[din_ptr]]!   @ vld1q_f32(din_ptr) \n" \
+  "vld1.32 {d10-d11}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
+  "vld1.32 {d12-d13}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n"
+#define DO_RELU                                 \
+  "vmax.f32 q3, q3, %q[vzero] @ vmaxq_f32() \n" \
+  "vmax.f32 q4, q4, %q[vzero] @ vmaxq_f32() \n" \
+  "vmax.f32 q5, q5, %q[vzero] @ vmaxq_f32() \n" \
+  "vmax.f32 q6, q6, %q[vzero] @ vmaxq_f32() \n"
+#define DO_RELU6                               \
+  "vmin.f32 q3, q3, %q[vsix] @ vminq_f32() \n" \
+  "vmin.f32 q4, q4, %q[vsix] @ vmaxq_f32() \n" \
+  "vmin.f32 q5, q5, %q[vsix] @ vmaxq_f32() \n" \
+  "vmin.f32 q6, q6, %q[vsix] @ vmaxq_f32() \n"
+#define DO_LEAKY_RELU                            \
+  "vcge.f32 q7, q3, %q[vzero]   @ vcgeq_u32 \n"  \
+  "vmul.f32 q8, q3, %q[vscale]  @ vmulq_f32 \n"  \
+  "vcge.f32 q9, q4, %q[vzero]   @ vcgeq_u32 \n"  \
+  "vmul.f32 q10, q4, %q[vscale]  @ vmulq_f32 \n" \
+  "vcge.f32 q11, q5, %q[vzero]   @ vcgeq_u32 \n" \
+  "vmul.f32 q12, q5, %q[vscale]  @ vmulq_f32 \n" \
+  "vcge.f32 q13, q6, %q[vzero]   @ vcgeq_u32 \n" \
+  "vmul.f32 q14, q6, %q[vscale]  @ vmulq_f32 \n" \
+  "vbif q3, q8, q7               @ choose \n"    \
+  "vbif q4, q10, q9              @ choose \n"    \
+  "vbif q5, q12, q11             @ choose \n"    \
+  "vbif q6, q13, q13             @ choose \n"
+#define DO_STORE                                            \
+  "subs %[cnt], #1                                \n"       \
+  "vst1.32 {d6-d7}, [%[dout_ptr]]!       @ vst1q_f32()  \n" \
+  "vst1.32 {d8-d9}, [%[dout_ptr]]!       @ vst1q_f32()  \n" \
+  "vst1.32 {d10-d11}, [%[dout_ptr]]!     @ vst1q_f32()  \n" \
+  "vst1.32 {d12-d13}, [%[dout_ptr]]!     @ vst1q_f32()  \n" \
+  "bne  1b                                    \n"
+#endif
+/*
+* Data do activation process
+* Now support relu relu6 leakyrelu act
+*/
+inline void act_switch_process(float* src,
+                               float* dst,
+                               int size,
+                               const operators::ActivationParam* act_param) {
+  int cnt = size >> 4;
+  int remain = size % 16;
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  if (act_param != nullptr && act_param->has_active) {
+    float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef);
+    float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha);
+    if (cnt > 0) {
+      switch (act_param->active_type) {
+        case lite_api::ActivationType::kRelu:
+#ifdef __aarch64__
+          asm volatile(
+              LOAD_DATA DO_RELU DO_STORE
+              : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+              : [vzero] "w"(vzero)
+              : "memory", "cc", "v0", "v1", "v2", "v3");
+#else
+          asm volatile(
+              LOAD_DATA DO_RELU DO_STORE
+              : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+              : [vzero] "w"(vzero)
+              : "memory", "cc", "q3", "q4", "q5", "q6");
+#endif
+          break;
+        case lite_api::ActivationType::kRelu6:
+#ifdef __aarch64__
+          asm volatile(
+              LOAD_DATA DO_RELU DO_RELU6 DO_STORE
+              : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+              : [vzero] "w"(vzero), [vsix] "w"(vsix)
+              : "memory", "cc", "v0", "v1", "v2", "v3");
+#else
+          asm volatile(
+              LOAD_DATA DO_RELU DO_RELU6 DO_STORE
+              : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+              : [vzero] "w"(vzero), [vsix] "w"(vsix)
+              : "memory", "cc", "q3", "q4", "q5", "q6");
+#endif
+          break;
+        case lite_api::ActivationType::kLeakyRelu:
+#ifdef __aarch64__
+          asm volatile(
+              LOAD_DATA DO_LEAKY_RELU DO_STORE
+              : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+              : [vzero] "w"(vzero), [vscale] "w"(vscale)
+              : "memory",
+                "cc",
+                "v0",
+                "v1",
+                "v2",
+                "v3",
+                "v4",
+                "v5",
+                "v6",
+                "v7",
+                "v8",
+                "v9",
+                "v10",
+                "v11");
+#else
+          asm volatile(
+              LOAD_DATA DO_LEAKY_RELU DO_STORE
+              : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+              : [vzero] "w"(vzero), [vscale] "w"(vscale)
+              : "memory",
+                "cc",
+                "q3",
+                "q4",
+                "q5",
+                "q6",
+                "q7",
+                "q8",
+                "q9",
+                "q10",
+                "q11",
+                "q12",
+                "q13",
+                "q14");
+#endif
+          break;
+        default:
+          LOG(FATAL) << "this act_type: "
+                     << static_cast<int>(act_param->active_type)
+                     << " fuse not support";
+      }
+    }
+    // remain
+    switch (act_param->active_type) {
+      case lite_api::ActivationType::kRelu:
+        for (int i = 0; i < remain; i++) {
+          *dst = *src >= 0.f ? *src : 0.f;
+          src++;
+          dst++;
+        }
+      case lite_api::ActivationType::kRelu6:
+        for (int i = 0; i < remain; i++) {
+          float tmp = *src >= 0.f ? *src : 0.f;
+          *dst = tmp <= act_param->Relu_clipped_coef
+                     ? tmp
+                     : act_param->Relu_clipped_coef;
+          src++;
+          dst++;
+        }
+      case lite_api::ActivationType::kLeakyRelu:
+        for (int i = 0; i < remain; i++) {
+          if (*src >= 0.f) {
+            *dst = *src;
+          } else {
+            *dst = *src * act_param->Leaky_relu_alpha;
+          }
+          src++;
+          dst++;
+        }
+        break;
+      default:
+        LOG(FATAL) << "this act_type: "
+                   << static_cast<int>(act_param->active_type)
+                   << " fuse not support";
+    }
+  }
+}
+
 /*wirte result in outputs
 * input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w]
 */

--- a/lite/backends/arm/math/conv_depthwise.h
+++ b/lite/backends/arm/math/conv_depthwise.h
@@ -52,6 +52,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
                               const float* weights,
                               const float* bias,
                               const operators::ConvParam& param,
+                               const operators::ActivationParam act_param,
                               ARMContext* ctx);

 void conv_depthwise_3x3s1_fp32(const float* din,
@@ -67,7 +68,6 @@ void conv_depthwise_3x3s1_fp32(const float* din,
                               const float* bias,
                               int pad,
                               bool flag_bias,
-                               bool flag_relu,
                               const operators::ActivationParam act_param,
                               ARMContext* ctx);

@@ -84,7 +84,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
                               const float* bias,
                               int pad,
                               bool flag_bias,
-                               bool flag_relu,
+                               const operators::ActivationParam act_param,
                               ARMContext* ctx);

 template <typename Dtype>

--- a/lite/backends/arm/math/conv_impl.cc
+++ b/lite/backends/arm/math/conv_impl.cc
@@ -584,7 +584,6 @@ void conv_depthwise_3x3_fp32(const void* din,
  const int pad_w = paddings[2];
  int stride = param.strides[1];
  int pad = pad_w;
-  bool flag_relu = param.fuse_relu;
  bool flag_bias = param.bias != nullptr;
  bool pads_equal =
      ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
@@ -603,7 +602,6 @@ void conv_depthwise_3x3_fp32(const void* din,
                                bias,
                                pad,
                                flag_bias,
-                                flag_relu,
                                act_param,
                                ctx);
    } else {
@@ -638,7 +636,7 @@ void conv_depthwise_3x3_fp32(const void* din,
                                bias,
                                pad,
                                flag_bias,
-                                flag_relu,
+                                act_param,
                                ctx);
    } else {
      conv_3x3s2_depthwise_fp32(reinterpret_cast<const float*>(din),
@@ -653,6 +651,7 @@ void conv_depthwise_3x3_fp32(const void* din,
                                reinterpret_cast<const float*>(weights),
                                bias,
                                param,
+                                act_param,
                                ctx);
    }
  } else {

--- a/lite/operators/conv_op.cc
+++ b/lite/operators/conv_op.cc
@@ -52,7 +52,7 @@ inline int ConvOutputSize(int input_size,
  return output_size;
 }

-inline void UpdatePaddingAndDilation(std::vector<int>* paddings,
+void UpdatePaddingAndDilation(std::vector<int>* paddings,
                              std::vector<int>* dilations,
                              const std::vector<int>& strides,
                              const std::string padding_algorithm,

--- a/lite/operators/conv_op.h
+++ b/lite/operators/conv_op.h
@@ -136,7 +136,13 @@ class ConvOpLite : public OpLite {
  mutable ConvParam param_;
  std::string padding_algorithm_{""};
 };
-
+// update padding dilation
+void UpdatePaddingAndDilation(std::vector<int>* paddings,
+                              std::vector<int>* dilations,
+                              const std::vector<int>& strides,
+                              const std::string padding_algorithm,
+                              const lite::DDim data_dims,
+                              const lite::DDim& ksize);
 }  // namespace operators
 }  // namespace lite
 }  // namespace paddle