From 789accae631ddb51f45d36bea8b027b7ce956eec Mon Sep 17 00:00:00 2001 From: HappyAngel Date: Tue, 14 Jan 2020 18:24:17 +0800 Subject: [PATCH] [arm]add gemm + relu6/leakyrelu fusion (#2674) add gemm + relu6/leakyrelu fusion --- .../arm/math/conv3x3s1p01_depthwise_fp32.cc | 110 ++-- lite/backends/arm/math/conv_block_utils.h | 4 +- lite/backends/arm/math/conv_impl.cc | 8 +- lite/backends/arm/math/conv_winograd_3x3.cc | 4 +- lite/backends/arm/math/fill_bias_relu.cc | 236 +++++++- lite/backends/arm/math/fill_bias_relu.h | 17 +- lite/backends/arm/math/gru_utils.h | 6 +- lite/backends/arm/math/packed_sgemm.cc | 121 ++-- lite/backends/arm/math/packed_sgemm.h | 3 +- lite/backends/arm/math/sgemm.cc | 4 +- lite/backends/arm/math/sgemm.h | 2 +- lite/kernels/arm/conv_transpose_compute.cc | 7 +- lite/kernels/arm/fc_compute.cc | 4 +- lite/kernels/arm/matmul_compute.cc | 12 +- lite/kernels/arm/mul_compute.cc | 4 +- lite/tests/kernels/CMakeLists.txt | 2 - .../kernels/conv2d_transpose_compute_test.cc | 535 ------------------ .../tests/math/conv_transpose_compute_test.cc | 8 + lite/tests/math/sgemm_compute_test.cc | 12 +- 19 files changed, 405 insertions(+), 694 deletions(-) delete mode 100644 lite/tests/kernels/conv2d_transpose_compute_test.cc diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc index ddb3675faf..6ea9c4dcdb 100644 --- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc @@ -924,58 +924,58 @@ void conv_depthwise_3x3s1_fp32(const float *din, \ "st1 {v15.4s}, [%[doutr3]], #16 \n" -#define RIGHT_RESULT_S1_RELU6 \ - "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "bif v12.16b, v22.16b, v18.16b \n" \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ - "bif v13.16b, v23.16b, v18.16b \n" \ - \ - "fmla v15.4s , v10.4s, v20.s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "bif v14.16b, 
v24.16b, v18.16b \n" \ - "fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmin v15.4s, v15.4s, %[vsix].4s \n" /*relu6*/ \ - "bif v15.16b, v25.16b, v18.16b \n" \ - \ +#define RIGHT_RESULT_S1_RELU6 \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "bif v12.16b, v22.16b, v18.16b \n" \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + "fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmin v15.4s, v15.4s, %[vsix].4s \n" /*relu6*/ \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ "st1 {v15.4s}, [%[doutr3]], #16 \n" #define RIGHT_RESULT_S1_LEAKY_RELU \ @@ -1586,7 +1586,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, /* r3 */ \ "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ \ - "vld1.32 {d28-d29}, [%[six_ptr]]! @ load din r0\n" \ + "vld1.32 {d28-d29}, [%[six_ptr]] @ load din r0\n" \ "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ \ "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ @@ -1617,7 +1617,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, /* r3 */ \ "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ \ - "vld1.32 {d28-d29}, [%[scale_ptr]]! @ load din r0\n" \ + "vld1.32 {d28-d29}, [%[scale_ptr]] @ load din r0\n" \ \ "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ \ @@ -1694,7 +1694,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, /* r3 */ \ "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ \ - "vld1.32 {d28-d29}, [%[scale_ptr]]! 
@ load din r0\n" \ + "vld1.32 {d28-d29}, [%[scale_ptr]] @ load din r0\n" \ \ "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ \ diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index e148bb6928..240679898f 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -2237,7 +2237,7 @@ inline void act_switch_process(float* src, int cnt = size >> 4; int remain = size % 16; float32x4_t vzero = vdupq_n_f32(0.f); - if (act_param != nullptr && act_param->has_active) { + if (act_param != nullptr) { float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef); float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha); if (cnt > 0) { @@ -2327,6 +2327,7 @@ inline void act_switch_process(float* src, src++; dst++; } + break; case lite_api::ActivationType::kRelu6: for (int i = 0; i < remain; i++) { float tmp = *src >= 0.f ? *src : 0.f; @@ -2336,6 +2337,7 @@ inline void act_switch_process(float* src, src++; dst++; } + break; case lite_api::ActivationType::kLeakyRelu: for (int i = 0; i < remain; i++) { if (*src >= 0.f) { diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index 9156cd162f..c327c715c5 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -180,6 +180,8 @@ void conv1x1s1_gemm(const float* i_data, bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; + auto act_param = param.activation_param; + int hblock = get_hblock(ctx); int m_roundup = hblock * ((m + hblock - 1) / hblock); int weights_size_per_group = m * k; @@ -223,7 +225,7 @@ void conv1x1s1_gemm(const float* i_data, n, bias_group, flag_bias, - flag_relu, + act_param, ctx); } } @@ -361,6 +363,8 @@ void conv_im2col_gemm(const float* i_data, int hblock = get_hblock(ctx); int m_roundup = hblock * ((m + hblock - 1) / hblock); int weights_size_per_group = m * k; + + auto act_param = param.activation_param; if (n > 1) { weights_size_per_group = ((m_roundup * k + 15) / 16) * 16; } @@ -422,7 +426,7 @@ void conv_im2col_gemm(const float* i_data, n, bias_group, flag_bias, - flag_relu, + act_param, ctx); } } diff --git a/lite/backends/arm/math/conv_winograd_3x3.cc b/lite/backends/arm/math/conv_winograd_3x3.cc index 894b946a32..449c9e51db 100644 --- a/lite/backends/arm/math/conv_winograd_3x3.cc +++ b/lite/backends/arm/math/conv_winograd_3x3.cc @@ -44,6 +44,8 @@ void conv_winograd3x3(const float* din, int size_out_channel = wout * hout; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; + auto act_param = param.activation_param; + act_param.has_active = false; //! 
transform input int tile_w = (wout + 5) / 6; @@ -127,7 +129,7 @@ void conv_winograd3x3(const float* din, size_tile, nullptr, false, - false, + act_param, ctx); } diff --git a/lite/backends/arm/math/fill_bias_relu.cc b/lite/backends/arm/math/fill_bias_relu.cc index 7137a0363b..c585548bf1 100644 --- a/lite/backends/arm/math/fill_bias_relu.cc +++ b/lite/backends/arm/math/fill_bias_relu.cc @@ -115,7 +115,241 @@ void fill_bias_relu(int* tensor, } } } - +#ifdef __aarch64__ +#define FILL_BIAS \ + "1: \n" \ + "ld1 {v0.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v1.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "add v0.4s, v0.4s, %[vbias].4s \n" \ + "add v1.4s, v1.4s, %[vbias].4s \n" \ + "add v2.4s, v2.4s, %[vbias].4s \n" \ + "add v3.4s, v3.4s, %[vbias].4s \n" +#define FILL_RELU \ + "fmax v0.4s, v0.4s, %[vzero].4s \n" /* vmaxq_f32() */ \ + "fmax v1.4s, v1.4s, %[vzero].4s \n" /* vmaxq_f32() */ \ + "fmax v2.4s, v2.4s, %[vzero].4s \n" /* vmaxq_f32() */ \ + "fmax v3.4s, v3.4s, %[vzero].4s \n" /* vmaxq_f32() */ +#define FILL_RELU6 \ + "fmin v0.4s, v0.4s, %[vsix].4s \n" /* vmaxq_f32() */ \ + "fmin v1.4s, v1.4s, %[vsix].4s \n" /* vmaxq_f32() */ \ + "fmin v2.4s, v2.4s, %[vsix].4s \n" /* vmaxq_f32() */ \ + "fmin v3.4s, v3.4s, %[vsix].4s \n" /* vmaxq_f32() */ +#define FILL_LEAKY_RELU \ + "cmhs v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v5.4s, v0.4s, %[vscale].4s \n" /* vmulq_f32 */ \ + "cmhs v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v7.4s, v1.4s, %[vscale].4s \n" /* vmulq_f32 */ \ + "cmhs v8.4s, v2.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v9.4s, v2.4s, %[vscale].4s \n" /* vmulq_f32 */ \ + "cmhs v10.4s, v3.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v11.4s, v3.4s, %[vscale].4s \n" /* vmulq_f32 */ \ + "bif v0.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v1.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v2.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v3.16b, v11.16b, v10.16b \n" /* choose*/ +#define FILL_STORE \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "st1 {v0.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \ + "st1 {v1.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \ + "st1 {v2.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \ + "st1 {v3.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \ + "bne 1b \n" +#else +#define FILL_BIAS \ + "1: \n" \ + "vld1.32 {d6-d7}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \ + "vld1.32 {d8-d9}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \ + "vld1.32 {d10-d11}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \ + "vld1.32 {d12-d13}, [%[din_ptr]]! 
@ vld1q_f32(din_ptr) \n" \ + "vadd.f32 q3, q3, %q[vbias] @ add \n" \ + "vadd.f32 q4, q4, %q[vbias] @ add \n" \ + "vadd.f32 q5, q5, %q[vbias] @ add \n" \ + "vadd.f32 q6, q6, %q[vbias] @ add \n" +#define FILL_RELU \ + "vmax.f32 q3, q3, %q[vzero] @ vmaxq_f32() \n" \ + "vmax.f32 q4, q4, %q[vzero] @ vmaxq_f32() \n" \ + "vmax.f32 q5, q5, %q[vzero] @ vmaxq_f32() \n" \ + "vmax.f32 q6, q6, %q[vzero] @ vmaxq_f32() \n" +#define FILL_RELU6 \ + "vmin.f32 q3, q3, %q[vsix] @ vminq_f32() \n" \ + "vmin.f32 q4, q4, %q[vsix] @ vmaxq_f32() \n" \ + "vmin.f32 q5, q5, %q[vsix] @ vmaxq_f32() \n" \ + "vmin.f32 q6, q6, %q[vsix] @ vmaxq_f32() \n" +#define FILL_LEAKY_RELU \ + "vcge.f32 q7, q3, %q[vzero] @ vcgeq_u32 \n" \ + "vmul.f32 q8, q3, %q[vscale] @ vmulq_f32 \n" \ + "vcge.f32 q9, q4, %q[vzero] @ vcgeq_u32 \n" \ + "vmul.f32 q10, q4, %q[vscale] @ vmulq_f32 \n" \ + "vcge.f32 q11, q5, %q[vzero] @ vcgeq_u32 \n" \ + "vmul.f32 q12, q5, %q[vscale] @ vmulq_f32 \n" \ + "vcge.f32 q13, q6, %q[vzero] @ vcgeq_u32 \n" \ + "vmul.f32 q14, q6, %q[vscale] @ vmulq_f32 \n" \ + "vbif q3, q8, q7 @ choose \n" \ + "vbif q4, q10, q9 @ choose \n" \ + "vbif q5, q12, q11 @ choose \n" \ + "vbif q6, q14, q13 @ choose \n" +#define FILL_STORE \ + "subs %[cnt], #1 \n" \ + "vst1.32 {d6-d7}, [%[dout_ptr]]! @ vst1q_f32() \n" \ + "vst1.32 {d8-d9}, [%[dout_ptr]]! @ vst1q_f32() \n" \ + "vst1.32 {d10-d11}, [%[dout_ptr]]! @ vst1q_f32() \n" \ + "vst1.32 {d12-d13}, [%[dout_ptr]]! @ vst1q_f32() \n" \ + "bne 1b \n" +#endif +template <> +void fill_bias_act(float* tensor, + const float* bias, + int channel, + int channel_size, + bool flag_bias, + const operators::ActivationParam* act_param) { + float* data = tensor; + int cnt = channel_size >> 4; + int remain = channel_size % 16; + float32x4_t vzero = vdupq_n_f32(0.f); + if (act_param != nullptr && act_param->has_active) { + float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha); + for (int j = 0; j < channel; j++) { + float bias_data = flag_bias ? 
bias[j] : 0.f; + float* src = data + j * channel_size; + float* dst = data + j * channel_size; + float32x4_t vbias = vdupq_n_f32(bias_data); + if (cnt > 0) { + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile( + FILL_BIAS FILL_RELU FILL_STORE + : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt) + : [vzero] "w"(vzero), [vbias] "w"(vbias) + : "memory", "cc", "v0", "v1", "v2", "v3"); +#else + asm volatile( + FILL_BIAS FILL_RELU FILL_STORE + : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt) + : [vzero] "w"(vzero), [vbias] "w"(vbias) + : "memory", "cc", "q3", "q4", "q5", "q6"); +#endif + break; + case lite_api::ActivationType::kRelu6: +#ifdef __aarch64__ + asm volatile( + FILL_BIAS FILL_RELU FILL_RELU6 FILL_STORE + : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt) + : [vzero] "w"(vzero), [vsix] "w"(vsix), [vbias] "w"(vbias) + : "memory", "cc", "v0", "v1", "v2", "v3"); +#else + asm volatile( + FILL_BIAS FILL_RELU FILL_RELU6 FILL_STORE + : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt) + : [vzero] "w"(vzero), [vsix] "w"(vsix), [vbias] "w"(vbias) + : "memory", "cc", "q3", "q4", "q5", "q6"); +#endif + break; + case lite_api::ActivationType::kLeakyRelu: +#ifdef __aarch64__ + asm volatile( + FILL_BIAS FILL_LEAKY_RELU FILL_STORE + : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt) + : [vzero] "w"(vzero), [vscale] "w"(vscale), [vbias] "w"(vbias) + : "memory", + "cc", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11"); +#else + asm volatile( + FILL_BIAS FILL_LEAKY_RELU FILL_STORE + : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt) + : [vzero] "w"(vzero), [vscale] "w"(vscale), [vbias] "w"(vbias) + : "memory", + "cc", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14"); +#endif + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param->active_type) + << " fuse not support"; + } + } + // remain + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: + for (int i = 0; i < remain; i++) { + *dst = *src >= 0.f ? *src : 0.f; + src++; + dst++; + } + case lite_api::ActivationType::kRelu6: + for (int i = 0; i < remain; i++) { + float tmp = *src >= 0.f ? *src : 0.f; + *dst = tmp <= act_param->Relu_clipped_coef + ? tmp + : act_param->Relu_clipped_coef; + src++; + dst++; + } + case lite_api::ActivationType::kLeakyRelu: + for (int i = 0; i < remain; i++) { + if (*src >= 0.f) { + *dst = *src; + } else { + *dst = *src * act_param->Leaky_relu_alpha; + } + src++; + dst++; + } + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param->active_type) + << " fuse not support"; + } + } + } else { + for (int j = 0; j < channel; ++j) { + float bias_data = flag_bias ? 
bias[j] : 0.f; + float32x4_t vbias = vdupq_n_f32(bias_data); + float* src = data + j * channel_size; + float* dst = data + j * channel_size; +#ifdef __aarch64__ + asm volatile(FILL_BIAS FILL_STORE + : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt) + : [vbias] "w"(vbias) + : "memory", "cc", "v0", "v1", "v2", "v3"); +#else + asm volatile(FILL_BIAS FILL_STORE + : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt) + : [vbias] "w"(vbias) + : "memory", "cc", "q3", "q4", "q5", "q6"); +#endif + } + } +} } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/fill_bias_relu.h b/lite/backends/arm/math/fill_bias_relu.h index 254d6d43be..ce775a96a1 100644 --- a/lite/backends/arm/math/fill_bias_relu.h +++ b/lite/backends/arm/math/fill_bias_relu.h @@ -37,7 +37,22 @@ void fill_bias_relu(Dtype* tensor, int channel_size, bool flag_bias, bool flag_relu); - +/** + * * \brief neon implementation to add bias and activation(relu, relu6, + * leakyrelu) + * * @param tensor + * * @param bias + * * @param channel + * * @param channel_size + * + */ +template +void fill_bias_act(Dtype* tensor, + const Dtype* bias, + int channel, + int channel_size, + bool flag_bias, + const operators::ActivationParam* act_param); } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/gru_utils.h b/lite/backends/arm/math/gru_utils.h index 9bef1889b8..9d57f81fc5 100644 --- a/lite/backends/arm/math/gru_utils.h +++ b/lite/backends/arm/math/gru_utils.h @@ -383,6 +383,8 @@ struct GRUUnitFunctor { const lite_api::ActivationType active_gate, bool origin_mode, ARMContext* ctx) { + operators::ActivationParam act_param; + act_param.has_active = false; if (value.prev_out_value) { sgemm(false, false, @@ -399,7 +401,7 @@ struct GRUUnitFunctor { frame_size * 3, nullptr, false, - false, + act_param, ctx); } gru_unit_reset_act(active_gate, value, frame_size, batch_size); @@ -420,7 +422,7 @@ struct GRUUnitFunctor { frame_size * 3, nullptr, false, - false, + act_param, ctx); } diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc index 092e6937c4..cb9c049d81 100644 --- a/lite/backends/arm/math/packed_sgemm.cc +++ b/lite/backends/arm/math/packed_sgemm.cc @@ -14,6 +14,7 @@ #include "lite/backends/arm/math/packed_sgemm.h" #include +#include "lite/backends/arm/math/conv_block_utils.h" namespace paddle { namespace lite { @@ -51,7 +52,7 @@ void sgemm_prepacked_8x12(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext *ctx); void pack_m4(float *out, @@ -83,7 +84,7 @@ void sgemm_prepacked_4x4(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext *ctx); #else // for kA72 @@ -136,7 +137,7 @@ void sgemm_prepacked_6x8(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext *ctx); // for kA73, 4x8 void sgemm_prepacked_4x8(bool is_transB, @@ -151,7 +152,7 @@ void sgemm_prepacked_4x8(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext *ctx); #endif // __aarch64__ @@ -249,7 +250,7 @@ void sgemm_prepack(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext *ctx) { #ifdef __aarch64__ if (M <= 4) { @@ -265,7 +266,7 @@ void sgemm_prepack(bool 
is_transB, ldc, bias, has_bias, - has_relu, + act_param, ctx); } else { sgemm_prepacked_8x12(is_transB, @@ -280,7 +281,7 @@ void sgemm_prepack(bool is_transB, ldc, bias, has_bias, - has_relu, + act_param, ctx); } #else // armv7 @@ -297,7 +298,7 @@ void sgemm_prepack(bool is_transB, ldc, bias, has_bias, - has_relu, + act_param, ctx); } else { sgemm_prepacked_6x8(is_transB, @@ -312,7 +313,7 @@ void sgemm_prepack(bool is_transB, ldc, bias, has_bias, - has_relu, + act_param, ctx); } #endif // arm64 @@ -2283,7 +2284,7 @@ void sgemm_prepacked_8x12(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext *ctx) { size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto workspace = ctx->workspace_data(); @@ -2837,33 +2838,6 @@ void sgemm_prepacked_8x12(bool is_transB, "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ "11: \n" /* check if relu */ - "cbz %w[relu], 12f\n" /* skip relu */ - "movi v2.4s, #0\n" /* for relu*/ - "fmax v8.4s, v8.4s, v2.4s\n" /* relu*/ - "fmax v9.4s, v9.4s, v2.4s\n" /* relu*/ - "fmax v10.4s, v10.4s, v2.4s\n" /* relu*/ - "fmax v11.4s, v11.4s, v2.4s\n" /* relu*/ - "fmax v12.4s, v12.4s, v2.4s\n" /* relu*/ - "fmax v13.4s, v13.4s, v2.4s\n" /* relu*/ - "fmax v14.4s, v14.4s, v2.4s\n" /* relu*/ - "fmax v15.4s, v15.4s, v2.4s\n" /* relu*/ - "fmax v16.4s,v16.4s,v2.4s\n" /* relu*/ - "fmax v17.4s,v17.4s,v2.4s\n" /* relu*/ - "fmax v18.4s, v18.4s, v2.4s\n" /* relu*/ - "fmax v19.4s, v19.4s, v2.4s\n" /* relu*/ - "fmax v20.4s, v20.4s, v2.4s\n" /* relu*/ - "fmax v21.4s, v21.4s, v2.4s\n" /* relu*/ - "fmax v22.4s, v22.4s, v2.4s\n" /* relu*/ - "fmax v23.4s, v23.4s, v2.4s\n" /* relu*/ - "fmax v24.4s,v24.4s,v2.4s\n" /* relu*/ - "fmax v25.4s,v25.4s,v2.4s\n" /* relu*/ - "fmax v26.4s, v26.4s, v2.4s\n" /* relu*/ - "fmax v27.4s, v27.4s, v2.4s\n" /* relu*/ - "fmax v28.4s, v28.4s, v2.4s\n" /* relu*/ - "fmax v29.4s, v29.4s, v2.4s\n" /* relu*/ - "fmax v30.4s, v30.4s, v2.4s\n" /* relu*/ - "fmax v31.4s, v31.4s, v2.4s\n" /* relu*/ - "12: \n" "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ @@ -2886,7 +2860,6 @@ void sgemm_prepacked_8x12(bool is_transB, [c_ptr6] "+r"(c_ptr6), [c_ptr7] "+r"(c_ptr7) : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), [has_beta] "r"(has_beta), [beta] "r"(beta) : "cc","memory", @@ -2911,6 +2884,13 @@ void sgemm_prepacked_8x12(bool is_transB, } } } + if (act_param.has_active) { +#pragma omp parallel for num_threads(threads) + for (unsigned int x = 0; x < M; x++) { + float *dst = C + x * ldc; + act_switch_process(dst, dst, N, &act_param); + } + } } void sgemm_prepacked_4x4(bool is_transB, @@ -2925,7 +2905,7 @@ void sgemm_prepacked_4x4(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext *ctx) { size_t l2_cache = ctx->llc_size() > 0 ? 
ctx->llc_size() : 512 * 1024; auto workspace = ctx->workspace_data(); @@ -3158,13 +3138,6 @@ void sgemm_prepacked_4x4(bool is_transB, "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ "11: \n" /* check if relu */ - "cbz %w[relu], 12f\n" /* skip relu */ - "movi v2.4s, #0\n" /* for relu*/ - "fmax v8.4s, v8.4s, v2.4s\n" /* relu*/ - "fmax v9.4s, v9.4s, v2.4s\n" /* relu*/ - "fmax v10.4s, v10.4s, v2.4s\n" /* relu*/ - "fmax v11.4s, v11.4s, v2.4s\n" /* relu*/ - "12: \n" "st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */ "st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */ "st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */ @@ -3179,7 +3152,6 @@ void sgemm_prepacked_4x4(bool is_transB, [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3) : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), [has_beta] "r"(has_beta), [beta] "r"(beta) : "cc","memory", @@ -3197,6 +3169,13 @@ void sgemm_prepacked_4x4(bool is_transB, } } } + if (act_param.has_active) { +#pragma omp parallel for num_threads(threads) + for (unsigned int x = 0; x < M; x++) { + float *dst = C + x * ldc; + act_switch_process(dst, dst, N, &act_param); + } + } } #else // __aarch64__ /** @@ -3222,7 +3201,7 @@ void sgemm_prepacked_6x8(bool is_transB, int ldc, const float* bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext* ctx) { size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); @@ -3601,22 +3580,6 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q4, q4, q0 @ for relu\n" - "vmax.f32 q5, q5, q0 @ for relu\n" - "vmax.f32 q6, q6, q0 @ for relu\n" - "vmax.f32 q7, q7, q0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" "vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n" "vst1.32 {d16-d19}, [%[c_ptr2]]! @ store r2\n" @@ -3634,7 +3597,6 @@ void sgemm_prepacked_6x8(bool is_transB, [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), [beta] "r"(beta) : "q0","q1","q2","q3","q4", "q5","q6","q7","q8","q9","q10","q11", @@ -3654,6 +3616,13 @@ void sgemm_prepacked_6x8(bool is_transB, } } } + if (act_param.has_active) { +#pragma omp parallel for num_threads(threads) + for (unsigned int x = 0; x < M; x++) { + float* dst = C + x * ldc; + act_switch_process(dst, dst, N, &act_param); + } + } } void sgemm_prepacked_4x8(bool is_transB, @@ -3668,7 +3637,7 @@ void sgemm_prepacked_4x8(bool is_transB, int ldc, const float* bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext* ctx) { size_t l2_cache = ctx->llc_size() > 0 ? 
ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); @@ -3953,18 +3922,6 @@ void sgemm_prepacked_4x8(bool is_transB, /*aptr - 16*/ "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" "vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d20-d23}, [%[c_ptr1]]! @ store r1\n" "vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2\n" @@ -3978,7 +3935,6 @@ void sgemm_prepacked_4x8(bool is_transB, [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), [beta] "r"(beta) : "q0","q1","q2","q3", "q4","q5","q6","q7","q8","q9","q10", @@ -3995,6 +3951,13 @@ void sgemm_prepacked_4x8(bool is_transB, } } } + if (act_param.has_active) { +#pragma omp parallel for num_threads(threads) + for (unsigned int x = 0; x < M; x++) { + float* dst = C + x * ldc; + act_switch_process(dst, dst, N, &act_param); + } + } } #endif // __aarch64__ diff --git a/lite/backends/arm/math/packed_sgemm.h b/lite/backends/arm/math/packed_sgemm.h index 6c14cdb2ef..bc23e8eab7 100644 --- a/lite/backends/arm/math/packed_sgemm.h +++ b/lite/backends/arm/math/packed_sgemm.h @@ -17,6 +17,7 @@ #include #include "lite/core/context.h" #include "lite/core/tensor.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { @@ -74,7 +75,7 @@ void sgemm_prepack(bool is_transB, int ldc, const float* bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext* ctx); } // namespace math diff --git a/lite/backends/arm/math/sgemm.cc b/lite/backends/arm/math/sgemm.cc index f3123ddd71..f2ba090222 100644 --- a/lite/backends/arm/math/sgemm.cc +++ b/lite/backends/arm/math/sgemm.cc @@ -34,7 +34,7 @@ void sgemm(bool is_transA, int ldc, const float* bias, bool is_bias, - bool is_relu, + const operators::ActivationParam act_param, ARMContext* ctx) { int hblock = get_hblock(ctx); int m_roundup = hblock * ((M + hblock - 1) / hblock); @@ -56,7 +56,7 @@ void sgemm(bool is_transA, ldc, bias, is_bias, - is_relu, + act_param, ctx); TargetFree(TargetType::kARM, packed_A); } diff --git a/lite/backends/arm/math/sgemm.h b/lite/backends/arm/math/sgemm.h index 08f68fb3d4..b48080855f 100644 --- a/lite/backends/arm/math/sgemm.h +++ b/lite/backends/arm/math/sgemm.h @@ -39,7 +39,7 @@ void sgemm(bool is_transA, int ldc, const float* bias, bool is_bias, - bool is_relu, + const operators::ActivationParam act_param, ARMContext* ctx); } // namespace math diff --git a/lite/kernels/arm/conv_transpose_compute.cc b/lite/kernels/arm/conv_transpose_compute.cc index 5c58b29713..6a6c903a43 100644 --- a/lite/kernels/arm/conv_transpose_compute.cc +++ b/lite/kernels/arm/conv_transpose_compute.cc @@ -103,6 +103,7 @@ void Conv2DTransposeCompute::Run() { auto din = param.x->data(); auto dout = param.output->mutable_data(); auto weights = param.filter->data(); + auto act_param = param.activation_param; for (int i = 0; i < num; i++) { const float* din_batch = din + i * chin * hin * win; float* dout_batch = dout + i * chout * hout * wout; @@ -115,7 +116,9 @@ void Conv2DTransposeCompute::Run() { const float* din_group = din_batch + g * group_size_in; const float* 
weights_group = weights + g * group_size_weights; float* coldata_group = col_data + g * group_size_coldata; - + if (flag_bias) { + act_param.has_active = false; + } lite::arm::math::sgemm_prepack(false, m, n, @@ -128,7 +131,7 @@ void Conv2DTransposeCompute::Run() { n, nullptr, false, - fuse_relu && (!flag_bias), + act_param, &ctx); } if (!flag_1x1s1p1) { diff --git a/lite/kernels/arm/fc_compute.cc b/lite/kernels/arm/fc_compute.cc index 525eca269b..3b61e0b013 100644 --- a/lite/kernels/arm/fc_compute.cc +++ b/lite/kernels/arm/fc_compute.cc @@ -94,6 +94,8 @@ void FcCompute::Run() { b_data = bias_.data(); } if (flag_gemm_) { + operators::ActivationParam act_param; + act_param.has_active = false; lite::arm::math::sgemm(false, false, m_, @@ -109,7 +111,7 @@ void FcCompute::Run() { n_, nullptr, false, - false, + act_param, &ctx); if (param.bias) { CHECK_EQ(param.bias->numel(), n_); diff --git a/lite/kernels/arm/matmul_compute.cc b/lite/kernels/arm/matmul_compute.cc index d00a5bdc06..afcbe7267c 100644 --- a/lite/kernels/arm/matmul_compute.cc +++ b/lite/kernels/arm/matmul_compute.cc @@ -42,6 +42,9 @@ void MatMulCompute::Run() { float alpha = param.alpha; auto& ctx = this->ctx_->template As(); + operators::ActivationParam act_param; + act_param.has_active = false; + if (x_dims.size() > 2 && y_dims.size() >= 2) { // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N] // x: [B, M, K], y: [K, N], out: [B, M, N] @@ -97,7 +100,6 @@ void MatMulCompute::Run() { if (x_transpose) { x_data_trans = static_cast(malloc(sizeof(float) * x_inner)); } - if (y_dims.size() > 2) { for (size_t i = 0; i < x_dims.count(0, x_dims.size() - 2); ++i) { lite::arm::math::sgemm(x_transpose, @@ -115,7 +117,7 @@ void MatMulCompute::Run() { ldc, nullptr, false, - false, + act_param, &ctx); } } else { @@ -135,7 +137,7 @@ void MatMulCompute::Run() { ldc, nullptr, false, - false, + act_param, &ctx); } } @@ -200,7 +202,7 @@ void MatMulCompute::Run() { ldc, nullptr, false, - false, + act_param, &ctx); } else if (x_dims.size() > 2 && y_dims.size() == 1) { // x: [B, M, K], y: [K], out: [B, M] @@ -254,7 +256,7 @@ void MatMulCompute::Run() { ldc, nullptr, false, - false, + act_param, &ctx); } } diff --git a/lite/kernels/arm/mul_compute.cc b/lite/kernels/arm/mul_compute.cc index debe9e907c..a5de6c202c 100644 --- a/lite/kernels/arm/mul_compute.cc +++ b/lite/kernels/arm/mul_compute.cc @@ -67,6 +67,8 @@ void MulCompute::Run() { if (is_tranposed_y) { ldb = k_; } + operators::ActivationParam act_param; + act_param.has_active = false; lite::arm::math::sgemm_prepack(is_tranposed_y, m_, n_, @@ -79,7 +81,7 @@ void MulCompute::Run() { n_, nullptr, false, - false, + act_param, &ctx); } } diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index cc576dcfe5..762b20ec83 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -11,8 +11,6 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_ lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_conv_compute SRCS 
conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_conv2d_transpose_compute SRCS conv2d_transpose_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tests/kernels/conv2d_transpose_compute_test.cc b/lite/tests/kernels/conv2d_transpose_compute_test.cc deleted file mode 100644 index 6c348076ba..0000000000 --- a/lite/tests/kernels/conv2d_transpose_compute_test.cc +++ /dev/null @@ -1,535 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/core/arena/framework.h" - -namespace paddle { -namespace lite { - -inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { - return static_cast(a) < static_cast(b); -} - -template -void col2im(const Dtype* data_col, - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int pad_h0, - const int pad_h1, - const int pad_w0, - const int pad_w1, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - Dtype* data_im) { - memset(data_im, 0, height * width * channels * sizeof(float)); - const int output_h = - (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / - stride_h + - 1; - const int output_w = - (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + - 1; - const int channel_size = height * width; - for (int channel = channels; channel--; data_im += channel_size) { - for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h0 + kernel_row * dilation_h; - for (int output_rows = output_h; output_rows; output_rows--) { - if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { - data_col += output_w; - } else { - int input_col = -pad_w0 + kernel_col * dilation_w; - for (int output_col = output_w; output_col; output_col--) { - if (is_a_ge_zero_and_a_lt_b(input_col, width)) { - data_im[input_row * width + input_col] += *data_col; - } - data_col++; - input_col += stride_w; - } - } - input_row += stride_h; - } - } - } - } -} - -template -void fill_bias_relu(Dtype* tensor, - const Dtype* bias, - int channel, - int channel_size, - bool flag_bias, - bool flag_relu); - -template <> -void fill_bias_relu(float* tensor, - const float* bias, - int channel, - int channel_size, - bool flag_bias, - bool flag_relu) { - float* data = tensor; - if (flag_relu) { - for (int j = 0; j < channel; ++j) { - float bias_data = flag_bias ? bias[j] : 0.f; - for (int i = 0; i < channel_size; i++) { - data[i] += bias_data; - data[i] = data[i] > 0 ? data[i] : 0.f; - } - data += channel_size; - } - } else { - for (int j = 0; j < channel; ++j) { - float bias_data = flag_bias ? 
bias[j] : 0.f; - for (int i = 0; i < channel_size; i++) { - data[i] += bias_data; - } - data += channel_size; - } - } -} - -inline void UpdatePaddingAndDilation(std::vector* paddings, - std::vector* dilations, - const std::vector& strides, - const std::string padding_algorithm, - const DDim data_dims, - const std::vector& ksize) { - // when padding_desc is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (size_t i = 0; i < strides.size(); ++i) { - int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; - int pad_sum = std::max( - (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], - (int64_t)0); - int pad_0 = pad_sum / 2; - int pad_1 = pad_sum - pad_0; - // pad - *(paddings->begin() + i * 2) = pad_0; - *(paddings->begin() + i * 2 + 1) = pad_1; - // dilation - *(dilations->begin() + i) = 1; - } - } else if (padding_algorithm == "VALID") { - for (auto& it : *paddings) { - it = 0; - } - } -} - -template -static void basic_gemm(int m, - int n, - int k, - const type* a, - const type* b, - const type2* bias, - type2* c, - type2 alpha, - type2 beta, - bool trans_a = false, - bool trans_b = false, - bool flag_bias = false, - bool flag_relu = false) { -#pragma omp parallel for - for (int i = 0; i < m; ++i) { - type2 bias_data = (type2)0; - if (flag_bias) { - bias_data = bias[i]; - } - for (int j = 0; j < n; ++j) { - type2 sum = static_cast(0); - for (int l = 0; l < k; ++l) { - type av; - type bv; - if (trans_a) { - av = a[l * m + i]; - } else { - av = a[i * k + l]; - } - if (trans_b) { - bv = b[j * k + l]; - } else { - bv = b[l * n + j]; - } - sum += av * bv; - } - type2 tmp = alpha * sum + beta * c[i * n + j] + bias_data; - if (flag_relu) { - c[i * n + j] = tmp > (type2)0 ? tmp : (type2)0; - } else { - c[i * n + j] = tmp; - } - } - } -} - -//! for float, dtype1 and type2 is float -//! 
for int8, dytpe1 is char, dtype2 is int -template -bool deconv_basic(const Dtype1* din, - Dtype2* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const Dtype1* weights, - const Dtype2* bias, - int group, - int kernel_w, - int kernel_h, - int stride_w, - int stride_h, - int dila_w, - int dila_h, - int pad_w0, - int pad_w1, - int pad_h0, - int pad_h1, - bool flag_bias, - bool flag_relu) { - int m = chout * kernel_w * kernel_h / group; - int n = hin * win; - int k = chin / group; - - if (chin != chout || group != chin) { - CHECK_OR_FALSE(chin % group == 0); - CHECK_OR_FALSE(chout % group == 0); - } - lite::Tensor workspace_tensor; - std::vector wt_shape = {1, 1, 1, group * m * n}; - workspace_tensor.Resize(wt_shape); - auto* workspace_ptr = workspace_tensor.mutable_data(); - - int group_size_in = win * hin * chin / group; - int group_size_coldata = m * n; - int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); - bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w0 == 0) && (pad_h0 == 0) && - (pad_w1 == 0) && (pad_h1 == 0) && (dila_w == 1) && - (dila_h == 1); - - for (int i = 0; i < num; ++i) { - const Dtype1* din_batch = din + i * chin * hin * win; - Dtype2* dout_batch = dout + i * chout * hout * wout; - - Dtype2* col_data = workspace_ptr; - if (flag_1x1s1p1) { - col_data = dout_batch; - } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata * group); - for (int g = 0; g < group; ++g) { - const Dtype1* din_group = din_batch + g * group_size_in; - const Dtype1* weights_group = weights + g * group_size_weights; - Dtype2* coldata_group = col_data + g * group_size_coldata; - basic_gemm(m, - n, - k, - weights_group, - din_group, - nullptr, - coldata_group, - (Dtype2)1, - (Dtype2)0, - true, - false, - false, - (!flag_bias && flag_relu)); - } - if (!flag_1x1s1p1) { - col2im(col_data, - chout, - hout, - wout, - kernel_h, - kernel_w, - pad_h0, - pad_h1, - pad_w0, - pad_w1, - stride_h, - stride_w, - dila_h, - dila_w, - dout_batch); - } - if (flag_bias) { - fill_bias_relu( - dout_batch, bias, chout, wout * hout, flag_bias, flag_relu); - } - } - return true; -} - -class Conv2DTransposeComputeTester : public arena::TestCase { - protected: - // common attributes for this op. 
- std::string x_ = "x"; - std::string output_ = "out"; - std::string filter_ = "filter"; - std::string bias_ = "bias"; - std::string padding_algorithm_ = ""; - - std::vector strides_{1, 1}; - std::vector paddings_{0, 0, 0, 0}; - int groups_{1}; - std::vector dilations_{1, 1}; - bool flag_relu_{false}; - - int n_ = 1; - int ic_ = 1; - int oc_ = 1; - int ih_ = 9; - int iw_ = 9; - bool flag_bias_ = false; - int ks_ = 1; - - public: - Conv2DTransposeComputeTester(const Place& place, - const std::string& alias, - int n, - int ic, - int oc, - int ih, - int iw, - bool flag_bias, - bool flag_relu, - int dilation, - int stride, - int pad_h0, - int pad_h1, - int pad_w0, - int pad_w1, - int ks, - int groups, - std::string padding_algorithm) - : TestCase(place, alias) { - n_ = n; - ic_ = ic; - oc_ = oc; - ih_ = ih; - iw_ = iw; - ks_ = ks; - flag_bias_ = flag_bias; - padding_algorithm_ = padding_algorithm; - strides_ = std::vector({stride, stride}); - paddings_ = std::vector({pad_h0, pad_h1, pad_w0, pad_w1}); - dilations_ = std::vector({dilation, dilation}); - groups_ = groups; - flag_relu_ = flag_relu; - } - - void RunBaseline(Scope* scope) override { - auto* out = scope->NewTensor(output_); - CHECK(out); - auto* x = scope->FindTensor(x_); - auto input_dim = x->dims(); - std::vector ksize({1, 1, ks_, ks_}); - UpdatePaddingAndDilation(&paddings_, - &dilations_, - strides_, - padding_algorithm_, - input_dim, - ksize); - int oh = (ih_ - 1) * strides_[0] - paddings_[0] - paddings_[1] + - dilations_[0] * (ks_ - 1) + 1; - int ow = (iw_ - 1) * strides_[1] - paddings_[2] - paddings_[3] + - dilations_[1] * (ks_ - 1) + 1; - CHECK(oh > 0 || ow > 0); - - std::vector output_shape = {n_, oc_, oh, ow}; - DDim output_dims(output_shape); - out->Resize(output_dims); - auto* output_data = out->mutable_data(); - - const auto* x_data = x->data(); - auto* filter = scope->FindTensor(filter_); - const auto* filter_data = filter->data(); - const float* bias_data = nullptr; - if (flag_bias_) { - auto* bias = scope->FindTensor(bias_); - bias_data = bias->data(); - } - - deconv_basic(x_data, - output_data, - n_, - oc_, - oh, - ow, - ic_, - ih_, - iw_, - filter_data, - bias_data, - groups_, - ks_, - ks_, - strides_[1], - strides_[0], - dilations_[1], - dilations_[0], - paddings_[2], - paddings_[3], - paddings_[0], - paddings_[1], - flag_bias_, - flag_relu_); - } - - void PrepareOpDesc(cpp::OpDesc* op_desc) { - op_desc->SetType("conv2d_transpose"); - op_desc->SetInput("Input", {x_}); - op_desc->SetInput("Filter", {filter_}); - op_desc->SetOutput("Output", {output_}); - op_desc->SetAttr("strides", strides_); - op_desc->SetAttr("paddings", paddings_); - op_desc->SetAttr("groups", groups_); - op_desc->SetAttr("dilations", dilations_); - if (flag_bias_) { - op_desc->SetInput("Bias", {bias_}); - } - op_desc->SetAttr("fuse_relu", flag_relu_); - op_desc->SetAttr("padding_algorithm", padding_algorithm_); - } - - void PrepareData() override { - std::vector input_shape = {n_, ic_, ih_, iw_}; - std::vector filter_shape = {ic_, oc_ / groups_, ks_, ks_}; - std::vector bias_shape = {1, oc_, 1, 1}; - - // x tensor - DDim x_dims(input_shape); - std::vector x_data(x_dims.production()); - for (int i = 0; i < x_dims.production(); i++) { - float sign = i % 3 == 0 ? 
-1.0f : 1.0f; - x_data[i] = sign * static_cast(i % 128) * 0.013f + 0.001; - } - SetCommonTensor(x_, x_dims, x_data.data()); - - // filter tensor - DDim filter_dims(filter_shape); - std::vector filter_data(filter_dims.production()); - for (int i = 0; i < filter_dims.production(); i++) { - float sign = i % 3 == 0 ? -1.0f : 1.0f; - filter_data[i] = sign * static_cast(i % 128) * 0.01f + 0.001; - } - SetCommonTensor(filter_, filter_dims, filter_data.data()); - - // bias tensor - if (flag_bias_) { - DDim bias_dims(bias_shape); - std::vector bias_data(bias_dims.production()); - for (int i = 0; i < bias_dims.production(); i++) { - float sign = i % 3 == 0 ? -1.0f : 1.0f; - bias_data[i] = sign * static_cast(i % 128) * 0.01f + 0.001; - } - SetCommonTensor(bias_, bias_dims, bias_data.data()); - } - } -}; - -TEST(conv2d_transpose, precision) { - LOG(INFO) << "test conv2d_transpose op"; -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - for (auto n : {2}) { - for (auto ic : {1, 4 /*, 128*/}) { - for (auto oc : {1, 4 /*, 128*/}) { - LOG(INFO) << "n:" << n << ",ic:" << ic << ",oc:" << oc; - for (auto ih : {8, 8 /*, 56 , 112, 224, 512*/}) { - for (auto iw : {8, 16 /*, 56, 112, 224, 512*/}) { - for (auto flag_bias : {false, true}) { - for (auto flag_relu : {false, true}) { - for (auto dilation : {1, 2}) { - for (auto stride : {1, 2}) { - for (auto pad_h0 : {0, 1}) { - for (auto pad_h1 : {0, 1}) { - for (auto pad_w0 : {0, 1}) { - for (auto pad_w1 : {0, 1}) { - for (auto ks : {1, 4}) { - for (auto group : {1, 2}) { - for (auto padding_algorithm : - {"", "SAME", "VALID"}) { - // obtain shape - // LOG(INFO) << "n:" << n << ",ic:" << ic << - // ",oc:" << - // oc - // << ",ih:" << ih << ",iw:" << iw - // << ",flag_bias:" << flag_bias - // << ",flag_relu:" << flag_relu - // << ",dila:" << dilation - // << ",stride:" << stride - // << ",padding:" << padding << - // ",ks:" << ks - // << ",group:" << group; - if (ic % group != 0 || oc % group != 0) { - group = 1; - } - std::unique_ptr tester( - new Conv2DTransposeComputeTester( - place, - "def", - n, - ic, - oc, - ih, - iw, - flag_bias, - flag_relu, - dilation, - stride, - pad_h0, - pad_h1, - pad_w0, - pad_w1, - ks, - group, - padding_algorithm)); - arena::Arena arena( - std::move(tester), place, 2e-5); - arena.TestPrecision(); - } - } - } - } - } - } - } - } - } - } - } - } - } - } - } - } -#endif -} - -} // namespace lite -} // namespace paddle diff --git a/lite/tests/math/conv_transpose_compute_test.cc b/lite/tests/math/conv_transpose_compute_test.cc index 398e745d94..b5df49aa36 100644 --- a/lite/tests/math/conv_transpose_compute_test.cc +++ b/lite/tests/math/conv_transpose_compute_test.cc @@ -59,6 +59,7 @@ DEFINE_bool(flag_bias, false, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; +typedef paddle::lite::operators::ActivationParam ActivationParam; using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, @@ -117,6 +118,13 @@ void test_conv_transpose_fp32(const std::vector& input_dims, paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f); // paddle::lite::fill_tensor_const(*param.bias, 1.f); } + if (flag_relu) { + ActivationParam act_param; + act_param.has_active = true; + act_param.active_type = + (paddle::lite_api::ActivationType)1; // 2-relu6 4-leakyrelu + param.activation_param = act_param; + } Tensor tmp_weights; tmp_weights.Resize(weight_dim); tmp_weights.CopyDataFrom(*param.filter); diff --git a/lite/tests/math/sgemm_compute_test.cc 
b/lite/tests/math/sgemm_compute_test.cc index 6df5e671fe..8295cef341 100644 --- a/lite/tests/math/sgemm_compute_test.cc +++ b/lite/tests/math/sgemm_compute_test.cc @@ -22,9 +22,11 @@ #include "lite/core/context.h" #include "lite/core/profile/timer.h" #include "lite/core/tensor.h" +#include "lite/operators/op_params.h" #include "lite/tests/utils/tensor_utils.h" typedef paddle::lite::Tensor Tensor; +typedef paddle::lite::operators::ActivationParam ActivationParam; using paddle::lite::profile::Timer; DEFINE_int32(power_mode, @@ -136,6 +138,12 @@ bool test_sgemm(bool tra, has_relu); } Timer t0; + ActivationParam act_param; + if (has_relu) { + act_param.has_active = true; + act_param.active_type = + (paddle::lite_api::ActivationType)1; // 2-relu6 4-leakyrelu + } #ifdef LITE_WITH_ARM //! compute double ops = 2.0 * m * n * k; @@ -163,7 +171,7 @@ bool test_sgemm(bool tra, ldc, dbias, has_bias, - has_relu, + act_param, &ctx); } @@ -184,7 +192,7 @@ bool test_sgemm(bool tra, ldc, dbias, has_bias, - has_relu, + act_param, &ctx); t0.Stop(); } -- GitLab
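
Editor's note on the interface change in this patch: sgemm/sgemm_prepack and the GEMM-based conv paths now take an operators::ActivationParam in place of the old bool relu flag, and the bias + activation epilogue is handled by the new fill_bias_act. Below is a minimal usage sketch of the new API; the function name, buffer names, and sizes are illustrative placeholders, not code from this patch.

    #include "lite/backends/arm/math/fill_bias_relu.h"
    #include "lite/operators/op_params.h"

    // Sketch: add a per-channel bias and apply a fused relu6 to a GEMM output.
    // `out`, `bias`, `channel`, `channel_size` are hypothetical caller buffers.
    void apply_fused_bias_relu6(float* out,
                                const float* bias,
                                int channel,
                                int channel_size) {
      paddle::lite::operators::ActivationParam act_param;
      act_param.has_active = true;
      act_param.active_type = paddle::lite_api::ActivationType::kRelu6;
      act_param.Relu_clipped_coef = 6.f;   // clip value read by the relu6 path
      // act_param.Leaky_relu_alpha = 0.1f;  // used with kLeakyRelu instead

      // Adds the bias and applies the fused activation in one pass over the data.
      paddle::lite::arm::math::fill_bias_act<float>(
          out, bias, channel, channel_size, /*flag_bias=*/true, &act_param);
    }

Callers that do not want fusion (fc, mul, matmul, and the GRU path in this patch) construct the same struct with has_active = false and pass it through to sgemm/sgemm_prepack unchanged.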