From 9348cb4119e0a38b1b4abf795c991eb777f8cdbd Mon Sep 17 00:00:00 2001 From: chenjiaoAngel Date: Mon, 17 Aug 2020 18:28:12 +0800 Subject: [PATCH] fix format --- .../arm/math/conv5x5s1_depthwise_fp32.cc | 254 ++++-------------- lite/backends/arm/math/conv_impl.cc | 3 +- lite/kernels/arm/conv_depthwise.cc | 3 +- 3 files changed, 60 insertions(+), 200 deletions(-) diff --git a/lite/backends/arm/math/conv5x5s1_depthwise_fp32.cc b/lite/backends/arm/math/conv5x5s1_depthwise_fp32.cc index 1cf37ca14b..f20c31c13f 100644 --- a/lite/backends/arm/math/conv5x5s1_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv5x5s1_depthwise_fp32.cc @@ -1211,13 +1211,11 @@ inline void compute_all_padding_pre(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { int tmp_index = num - 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[4], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[tmp_index - k], weights[3 - k], 0.f, weights[5][3 - k], 4 - i); @@ -1429,7 +1427,7 @@ inline void compute_all_padding_pre(float* dout, *dout++ = sum; } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[4], bias[0], weights[4][3 - i], 3 - i); din_ptr_arr[num]++; for (int k = 0; k < num; k++) { @@ -1447,14 +1445,12 @@ inline void compute_all_padding_mid(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { // left int tmp = num - 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[num], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[tmp - k], weights[tmp - k], 0.f, weights[5][tmp - k], 4 - i); @@ -1531,7 +1527,7 @@ inline void compute_all_padding_mid(float* dout, *dout++ = sum; } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[num]++; for (int k = 0; k < num; k++) { @@ -1550,15 +1546,13 @@ inline void compute_all_padding_mid_out2(float* dout0, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { int tmp1 = num + 1; int tmp = num - 1; // left - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[num], bias[0], weights[6][0], 4 - i); float sum1 = compute_one_data_pre(din_ptr_arr[tmp1], weights[num], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { @@ -1647,7 +1641,7 @@ inline void compute_all_padding_mid_out2(float* dout0, *dout1++ = sum1; } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[num], bias[0], weights[num][3 - i], 3 - i); float sum1 = compute_one_data_post(din_ptr_arr[tmp1], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[tmp1]++; @@ -1660,11 +1654,6 @@ inline void compute_all_padding_mid_out2(float* dout0, *dout0++ = sum; *dout1++ = sum1; } - - for (int w = pad_right; w > 4; w--) { - *dout0++ = bias[0]; - *dout1++ = bias[0]; - } } inline void compute_all_padding_post(float* dout, const float** din_ptr_arr, @@ -1674,17 +1663,12 @@ inline void compute_all_padding_post(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { // left - /* for (int w = pad_left; w > 4; w--) { - *dout++ = bias[0]; - }*/ int tmp = num - 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[3], weights[num], bias[0], weights[5][num], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[2 - k], weights[tmp - k], 0.f, weights[5][tmp - k], 4 - i); @@ -1894,7 +1878,7 @@ inline void compute_all_padding_post(float* dout, } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[3], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[3]++; for (int k = 0; k < num; k++) { @@ -1927,10 +1911,6 @@ void conv_depthwise_5x5s1_bias(float* dout, int out_size = wout * hout; int cnt = loop_w >> 2; int remain = loop_w & 3; - int pad_left_new = pad_left > 4 ? 4 : pad_left; - int pad_right_new = pad_right > 4 ? 4 : pad_right; - int pad_top_new = pad_top > 4 ? 4 : pad_top; - int pad_bottom_new = pad_bottom > 4 ? 4 : pad_bottom; int in_channel_size = chin * in_size; int out_channel_size = chin * out_size; int weights_size = 25; @@ -1968,9 +1948,9 @@ void conv_depthwise_5x5s1_bias(float* dout, const float* din_ptr_arr[] = {din_ptr0, din_ptr1, din_ptr2, din_ptr3, din_ptr4, din_ptr5}; float32x4_t weights_vec[] = {wr0, wr1, wr2, wr3, wr4, wr5, wr6}; // top_h - for (int h = pad_top_new; h > 0; h--) { + for (int h = pad_top; h > 0; h--) { compute_all_padding_pre(dout_ptr0, din_ptr_arr, vbias, weights_vec, win, wout, pad_left, - pad_right, pad_left_new, pad_right_new, cnt, remain, 4 - h); + pad_right, cnt, remain, 4 - h); dout_ptr0 += wout; din_ptr_arr[0] = din_ptr0; din_ptr_arr[1] = din_ptr1; @@ -1982,7 +1962,7 @@ void conv_depthwise_5x5s1_bias(float* dout, // mid_h for (int h = 0; h < loop_h - 1; h += 2) { compute_all_padding_mid_out2(dout_ptr0, dout_ptr1, din_ptr_arr, vbias, weights_vec, win, wout, pad_left, - pad_right, pad_left_new, pad_right_new, cnt, remain, 4); + pad_right, cnt, remain, 4); dout_ptr0 += num_out; dout_ptr1 += num_out; din_ptr0 = din_ptr2; @@ -2000,7 +1980,7 @@ void conv_depthwise_5x5s1_bias(float* dout, } if (loop_h % 2 != 0) { compute_all_padding_mid(dout_ptr0, din_ptr_arr, vbias, weights_vec, win, wout, pad_left, - pad_right, pad_left_new, pad_right_new, cnt, remain, 4); + pad_right, cnt, remain, 4); dout_ptr0 = dout_ptr1; din_ptr0 = din_ptr1; din_ptr1 = din_ptr2; @@ -2014,9 +1994,9 @@ void conv_depthwise_5x5s1_bias(float* dout, din_ptr_arr[4] = din_ptr4; } // bottom - for (int h = 0; h < pad_bottom_new; h++) { + for (int h = 0; h < pad_bottom; h++) { compute_all_padding_post(dout_ptr0, din_ptr_arr, vbias, weights_vec, win, wout, pad_left, - pad_right, pad_left_new, pad_right_new, cnt, remain, 3 - h); + pad_right, cnt, remain, 3 - h); dout_ptr0 += wout; din_ptr_arr[0] = din_ptr0; din_ptr_arr[1] = din_ptr1; @@ -2037,17 +2017,11 @@ inline void compute_all_padding_pre_relu(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { int tmp_index = num - 1; - // left - /* for (int w = pad_left; w > 4; w--) { - *dout++ = bias[0] > 0.f ? bias[0] : 0.f; - }*/ - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[4], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[tmp_index - k], weights[3 - k], 0.f, weights[5][3 - k], 4 - i); @@ -2267,7 +2241,7 @@ inline void compute_all_padding_pre_relu(float* dout, *dout++ = sum > 0.f ? sum : 0.f; } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[4], bias[0], weights[4][3 - i], 3 - i); din_ptr_arr[num]++; for (int k = 0; k < num; k++) { @@ -2276,10 +2250,6 @@ inline void compute_all_padding_pre_relu(float* dout, } *dout++ = sum > 0.f ? sum : 0.f; } - /* - for (int w = pad_right; w > 4; w--) { - *dout++ = bias[0] > 0.f ? bias[0] : 0.f; - }*/ } inline void compute_all_padding_mid_relu(float* dout, const float** din_ptr_arr, @@ -2290,17 +2260,11 @@ inline void compute_all_padding_mid_relu(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { - // left - /* for (int w = pad_left; w > 4; w--) { - *dout++ = bias[0] > 0.f ? bias[0] : 0.f; - }*/ int tmp = num - 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[num], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[tmp - k], weights[tmp - k], 0.f, weights[5][tmp - k], 4 - i); @@ -2378,7 +2342,7 @@ inline void compute_all_padding_mid_relu(float* dout, *dout++ = sum > 0.f ? sum : 0.f; } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[num]++; for (int k = 0; k < num; k++) { @@ -2387,10 +2351,6 @@ inline void compute_all_padding_mid_relu(float* dout, } *dout++ = sum > 0.f ? sum : 0.f; } - /* - for (int w = pad_right; w > 4; w--) { - *dout++ = bias[0] > 0.f ? bias[0] : 0.f; - }*/ } inline void compute_all_padding_mid_relu_out2(float* dout0, float* dout1, @@ -2402,19 +2362,13 @@ inline void compute_all_padding_mid_relu_out2(float* dout0, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { // left - for (int w = pad_left; w > 4; w--) { - *dout0++ = bias[0] > 0.f ? bias[0] : 0.f; - *dout1++ = bias[0] > 0.f ? bias[0] : 0.f; - } int tmp = num - 1; int tmp1 = num + 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[num], bias[0], weights[6][0], 4 - i); float sum1 = compute_one_data_pre(din_ptr_arr[tmp1], weights[num], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { @@ -2505,7 +2459,7 @@ inline void compute_all_padding_mid_relu_out2(float* dout0, *dout1++ = sum1 > 0.f ? sum1 : 0.f; } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[num], bias[0], weights[num][3 - i], 3 - i); float sum1 = compute_one_data_post(din_ptr_arr[tmp1], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[tmp1]++; @@ -2518,10 +2472,6 @@ inline void compute_all_padding_mid_relu_out2(float* dout0, *dout0++ = sum > 0.f ? sum : 0.f; *dout1++ = sum1 > 0.f ? sum1 : 0.f; } - for (int w = pad_right; w > 4; w--) { - *dout0++ = bias[0] > 0.f ? bias[0] : 0.f; - *dout1++ = bias[0] > 0.f ? bias[0] : 0.f; - } } inline void compute_all_padding_post_relu(float* dout, const float** din_ptr_arr, @@ -2532,17 +2482,12 @@ inline void compute_all_padding_post_relu(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { // left - /*for (int w = pad_left; w > 4; w--) { - *dout++ = bias[0] > 0.f ? bias[0] : 0.f; - }*/ int tmp = num - 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[3], weights[num], bias[0], weights[5][num], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[2 - k], weights[tmp - k], 0.f, weights[5][tmp - k], 4 - i); @@ -2760,7 +2705,7 @@ inline void compute_all_padding_post_relu(float* dout, } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[3], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[3]++; for (int k = 0; k < num; k++) { @@ -2769,10 +2714,6 @@ inline void compute_all_padding_post_relu(float* dout, } *dout++ = sum > 0.f ? sum : 0.f; } -/* - for (int w = pad_right; w > 4; w--) { - *dout++ = bias[0] > 0.f ? bias[0] : 0.f; - }*/ } void conv_depthwise_5x5s1_bias_relu(float* dout, @@ -2797,10 +2738,6 @@ void conv_depthwise_5x5s1_bias_relu(float* dout, int out_size = wout * hout; int cnt = loop_w >> 2; int remain = loop_w & 3; - int pad_left_new = pad_left > 4 ? 4 : pad_left; - int pad_right_new = pad_right > 4 ? 4 : pad_right; - int pad_top_new = pad_top > 4 ? 4 : pad_top; - int pad_bottom_new = pad_bottom > 4 ? 4 : pad_bottom; int in_channel_size = chin * in_size; int out_channel_size = chin * out_size; int weights_size = 25; @@ -2839,9 +2776,9 @@ void conv_depthwise_5x5s1_bias_relu(float* dout, const float* din_ptr_arr[] = {din_ptr0, din_ptr1, din_ptr2, din_ptr3, din_ptr4, din_ptr5}; float32x4_t weights_vec[] = {wr0, wr1, wr2, wr3, wr4, wr5, wr6}; // top_h - for (int h = pad_top_new; h > 0; h--) { + for (int h = pad_top; h > 0; h--) { compute_all_padding_pre_relu(dout_ptr0, din_ptr_arr, vbias, weights_vec, vzero, win, wout, pad_left, - pad_right, pad_left_new, pad_right_new, cnt, remain, 4 - h); + pad_right, cnt, remain, 4 - h); dout_ptr0 += wout; din_ptr_arr[0] = din_ptr0; din_ptr_arr[1] = din_ptr1; @@ -2853,7 +2790,7 @@ void conv_depthwise_5x5s1_bias_relu(float* dout, // mid_h for (int h = 0; h < loop_h - 1; h += 2) { compute_all_padding_mid_relu_out2(dout_ptr0, dout_ptr1, din_ptr_arr, vbias, weights_vec, vzero, win, wout, pad_left, - pad_right, pad_left_new, pad_right_new, cnt, remain, 4); + pad_right, cnt, remain, 4); dout_ptr0 += num_out; dout_ptr1 += num_out; din_ptr0 = din_ptr2; @@ -2871,7 +2808,7 @@ void conv_depthwise_5x5s1_bias_relu(float* dout, } if (loop_h % 2 != 0) { compute_all_padding_mid_relu(dout_ptr0, din_ptr_arr, vbias, weights_vec, vzero, win, wout, pad_left, - pad_right, pad_left_new, pad_right_new, cnt, remain, 4); + pad_right, cnt, remain, 4); dout_ptr0 = dout_ptr1; din_ptr0 = din_ptr1; din_ptr1 = din_ptr2; @@ -2885,9 +2822,9 @@ void conv_depthwise_5x5s1_bias_relu(float* dout, din_ptr_arr[4] = din_ptr4; } // bottom - for (int h = 0; h < pad_bottom_new; h++) { + for (int h = 0; h < pad_bottom; h++) { compute_all_padding_post_relu(dout_ptr0, din_ptr_arr, vbias, weights_vec, vzero, win, wout, pad_left, - pad_right, pad_left_new, pad_right_new, cnt, remain, 3 - h); + pad_right, cnt, remain, 3 - h); dout_ptr0 += wout; din_ptr_arr[0] = din_ptr0; din_ptr_arr[1] = din_ptr1; @@ -2909,8 +2846,6 @@ inline void compute_all_padding_pre_relu6(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { @@ -2919,10 +2854,7 @@ inline void compute_all_padding_pre_relu6(float* dout, #endif int tmp_index = num - 1; // left - for (int w = pad_left; w > 4; w--) { - *dout++ = bias[0] > 0.f ? (bias[0] < six[0] ? bias[0] : six[0]) : 0.f; - } - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[4], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[tmp_index - k], weights[3 - k], 0.f, weights[5][3 - k], 4 - i); @@ -3150,7 +3082,7 @@ inline void compute_all_padding_pre_relu6(float* dout, *dout++ = sum > 0.f ? (sum < six[0] ? sum : six[0]) : 0.f; } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[4], bias[0], weights[4][3 - i], 3 - i); din_ptr_arr[num]++; for (int k = 0; k < num; k++) { @@ -3159,10 +3091,6 @@ inline void compute_all_padding_pre_relu6(float* dout, } *dout++ = sum > 0.f ? (sum < six[0] ? sum : six[0]) : 0.f; } - for (int w = pad_right; w > 4; w--) { - *dout++ = bias[0] > 0.f ? (bias[0] < six[0] ? bias[0] : six[0]) : 0.f; - } - } inline void compute_all_padding_mid_relu6(float* dout, const float** din_ptr_arr, @@ -3174,8 +3102,6 @@ inline void compute_all_padding_mid_relu6(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { @@ -3183,11 +3109,8 @@ inline void compute_all_padding_mid_relu6(float* dout, float32x4_t vsix = vld1q_f32(six); #endif // left - for (int w = pad_left; w > 4; w--) { - *dout++ = bias[0] > 0.f ? (bias[0] < six[0] ? bias[0] : six[0]) : 0.f; - } int tmp = num - 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[num], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[tmp - k], weights[tmp - k], 0.f, weights[5][tmp - k], 4 - i); @@ -3267,7 +3190,7 @@ inline void compute_all_padding_mid_relu6(float* dout, *dout++ = sum > 0.f ? (sum < six[0] ? sum : six[0]) : 0.f; } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[num]++; for (int k = 0; k < num; k++) { @@ -3276,9 +3199,6 @@ inline void compute_all_padding_mid_relu6(float* dout, } *dout++ = sum > 0.f ? (sum < six[0] ? sum : six[0]) : 0.f; } - for (int w = pad_right; w > 4; w--) { - *dout++ = bias[0] > 0.f ? (bias[0] < six[0] ? bias[0] : six[0]) : 0.f; - } } inline void compute_all_padding_mid_relu6_out2(float* dout0, @@ -3292,8 +3212,6 @@ inline void compute_all_padding_mid_relu6_out2(float* dout0, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { @@ -3301,13 +3219,9 @@ inline void compute_all_padding_mid_relu6_out2(float* dout0, float32x4_t vsix = vld1q_f32(six); #endif // left - for (int w = pad_left; w > 4; w--) { - *dout0++ = bias[0] > 0.f ? (bias[0] < six[0] ? bias[0] : six[0]) : 0.f; - *dout1++ = bias[0] > 0.f ? (bias[0] < six[0] ? bias[0] : six[0]) : 0.f; - } int tmp = num - 1; int tmp1 = num + 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[num], bias[0], weights[6][0], 4 - i); float sum1 = compute_one_data_pre(din_ptr_arr[tmp1], weights[num], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { @@ -3399,7 +3313,7 @@ inline void compute_all_padding_mid_relu6_out2(float* dout0, *dout1++ = sum1 > 0.f ? (sum1 < six[0] ? sum1 : six[0]) : 0.f; } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[num], bias[0], weights[num][3 - i], 3 - i); float sum1 = compute_one_data_post(din_ptr_arr[tmp1], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[tmp1]++; @@ -3412,10 +3326,6 @@ inline void compute_all_padding_mid_relu6_out2(float* dout0, *dout0++ = sum > 0.f ? (sum < six[0] ? sum : six[0]) : 0.f; *dout1++ = sum1 > 0.f ? (sum1 < six[0] ? sum1 : six[0]) : 0.f; } - for (int w = pad_right; w > 4; w--) { - *dout0++ = bias[0] > 0.f ? (bias[0] < six[0] ? bias[0] : six[0]) : 0.f; - *dout1++ = bias[0] > 0.f ? (bias[0] < six[0] ? bias[0] : six[0]) : 0.f; - } } inline void compute_all_padding_post_relu6(float* dout, const float** din_ptr_arr, @@ -3427,8 +3337,6 @@ inline void compute_all_padding_post_relu6(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { @@ -3436,11 +3344,8 @@ inline void compute_all_padding_post_relu6(float* dout, float32x4_t vsix = vld1q_f32(six); #endif // left - for (int w = pad_left; w > 4; w--) { - *dout++ = bias[0] > 0.f ? (bias[0] < six[0] ? bias[0] : six[0]) : 0.f; - } int tmp = num - 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[3], weights[num], bias[0], weights[5][num], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[2 - k], weights[tmp - k], 0.f, weights[5][tmp - k], 4 - i); @@ -3666,7 +3571,7 @@ inline void compute_all_padding_post_relu6(float* dout, } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[3], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[3]++; for (int k = 0; k < num; k++) { @@ -3675,9 +3580,6 @@ inline void compute_all_padding_post_relu6(float* dout, } *dout++ = sum > 0.f ? (sum < six[0] ? sum : six[0]) : 0.f; } - for (int w = pad_right; w > 4; w--) { - *dout++ = bias[0] > 0.f ? (bias[0] < six[0] ? bias[0] : six[0]) : 0.f; - } } void conv_depthwise_5x5s1_bias_relu6(float* dout, @@ -3703,10 +3605,6 @@ void conv_depthwise_5x5s1_bias_relu6(float* dout, int out_size = wout * hout; int cnt = loop_w >> 2; int remain = loop_w & 3; - int pad_left_new = pad_left > 4 ? 4 : pad_left; - int pad_right_new = pad_right > 4 ? 4 : pad_right; - int pad_top_new = pad_top > 4 ? 4 : pad_top; - int pad_bottom_new = pad_bottom > 4 ? 4 : pad_bottom; int in_channel_size = chin * in_size; int out_channel_size = chin * out_size; int weights_size = 25; @@ -3745,10 +3643,9 @@ void conv_depthwise_5x5s1_bias_relu6(float* dout, const float* din_ptr_arr[] = {din_ptr0, din_ptr1, din_ptr2, din_ptr3, din_ptr4, din_ptr5}; float32x4_t weights_vec[] = {wr0, wr1, wr2, wr3, wr4, wr5, wr6}; // top_h - for (int h = pad_top_new; h > 0; h--) { + for (int h = pad_top; h > 0; h--) { compute_all_padding_pre_relu6(dout_ptr0, din_ptr_arr, vbias, six, weights_vec, vzero, - win, wout, pad_left, pad_right, pad_left_new, - pad_right_new, cnt, remain, 4 - h); + win, wout, pad_left, pad_right, cnt, remain, 4 - h); dout_ptr0 += wout; din_ptr_arr[0] = din_ptr0; din_ptr_arr[1] = din_ptr1; @@ -3760,8 +3657,7 @@ void conv_depthwise_5x5s1_bias_relu6(float* dout, // mid_h for (int h = 0; h < loop_h - 1; h += 2) { compute_all_padding_mid_relu6_out2(dout_ptr0, dout_ptr1, din_ptr_arr, vbias, six, weights_vec, vzero, - win, wout, pad_left, pad_right, pad_left_new, - pad_right_new, cnt, remain, 4); + win, wout, pad_left, pad_right, cnt, remain, 4); dout_ptr0 += num_out; dout_ptr1 += num_out; din_ptr0 = din_ptr2; @@ -3779,8 +3675,7 @@ void conv_depthwise_5x5s1_bias_relu6(float* dout, } if (loop_h % 2 != 0) { compute_all_padding_mid_relu6(dout_ptr0, din_ptr_arr, vbias, six, weights_vec, vzero, - win, wout, pad_left, pad_right, pad_left_new, - pad_right_new, cnt, remain, 4); + win, wout, pad_left, pad_right, cnt, remain, 4); dout_ptr0 = dout_ptr1; din_ptr0 = din_ptr1; din_ptr1 = din_ptr2; @@ -3794,10 +3689,9 @@ void conv_depthwise_5x5s1_bias_relu6(float* dout, din_ptr_arr[4] = din_ptr4; } // bottom - for (int h = 0; h < pad_bottom_new; h++) { + for (int h = 0; h < pad_bottom; h++) { compute_all_padding_post_relu6(dout_ptr0, din_ptr_arr, vbias, six, weights_vec, vzero, - win, wout, pad_left, pad_right, pad_left_new, - pad_right_new, cnt, remain, 3 - h); + win, wout, pad_left, pad_right, cnt, remain, 3 - h); dout_ptr0 += wout; din_ptr_arr[0] = din_ptr0; din_ptr_arr[1] = din_ptr1; @@ -3819,8 +3713,6 @@ inline void compute_all_padding_pre_leakyRelu(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { @@ -3829,10 +3721,7 @@ inline void compute_all_padding_pre_leakyRelu(float* dout, #endif int tmp_index = num - 1; // left - for (int w = pad_left; w > 4; w--) { - *dout++ = bias[0] > 0.f ? bias[0] : bias[0] * scale[0]; - } - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[4], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[tmp_index - k], weights[3 - k], 0.f, weights[5][3 - k], 4 - i); @@ -4068,7 +3957,7 @@ inline void compute_all_padding_pre_leakyRelu(float* dout, *dout++ = sum > 0.f ? sum : sum * scale[0]; } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[4], bias[0], weights[4][3 - i], 3 - i); din_ptr_arr[num]++; for (int k = 0; k < num; k++) { @@ -4092,8 +3981,6 @@ inline void compute_all_padding_mid_leakyRelu(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { @@ -4105,7 +3992,7 @@ inline void compute_all_padding_mid_leakyRelu(float* dout, *dout++ = bias[0] > 0.f ? bias[0] : bias[0] * scale[0]; } int tmp = num - 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[num], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[tmp - k], weights[tmp - k], 0.f, weights[5][tmp - k], 4 - i); @@ -4188,7 +4075,7 @@ inline void compute_all_padding_mid_leakyRelu(float* dout, } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[num]++; for (int k = 0; k < num; k++) { @@ -4197,9 +4084,6 @@ inline void compute_all_padding_mid_leakyRelu(float* dout, } *dout++ = sum > 0.f ? sum : sum * scale[0]; } - for (int w = pad_right; w > 4; w--) { - *dout++ = bias[0] > 0.f ? bias[0] : bias[0] * scale[0]; - } } inline void compute_all_padding_mid_leakyRelu_out2(float* dout0, float* dout1, @@ -4212,8 +4096,6 @@ inline void compute_all_padding_mid_leakyRelu_out2(float* dout0, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { @@ -4221,13 +4103,9 @@ inline void compute_all_padding_mid_leakyRelu_out2(float* dout0, float32x4_t vscale = vld1q_f32(scale); #endif // left - for (int w = pad_left; w > 4; w--) { - *dout0++ = bias[0] > 0.f ? bias[0] : bias[0] * scale[0]; - *dout1++ = bias[0] > 0.f ? bias[0] : bias[0] * scale[0]; - } int tmp = num - 1; int tmp1 = num + 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[num], weights[num], bias[0], weights[6][0], 4 - i); float sum1 = compute_one_data_pre(din_ptr_arr[tmp1], weights[num], bias[0], weights[6][0], 4 - i); for (int k = 0; k < num; k++) { @@ -4322,7 +4200,7 @@ inline void compute_all_padding_mid_leakyRelu_out2(float* dout0, *dout1++ = sum1 > 0.f ? sum1 : sum1 * scale[0]; } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[num], weights[num], bias[0], weights[num][3 - i], 3 - i); float sum1 = compute_one_data_post(din_ptr_arr[tmp1], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[tmp1]++; @@ -4335,10 +4213,6 @@ inline void compute_all_padding_mid_leakyRelu_out2(float* dout0, *dout0++ = sum > 0.f ? sum : sum * scale[0]; *dout1++ = sum1 > 0.f ? sum1 : sum1 * scale[0]; } - for (int w = pad_right; w > 4; w--) { - *dout0++ = bias[0] > 0.f ? bias[0] : bias[0] * scale[0]; - *dout1++ = bias[0] > 0.f ? bias[0] : bias[0] * scale[0]; - } } inline void compute_all_padding_post_leakyRelu(float* dout, const float** din_ptr_arr, @@ -4350,8 +4224,6 @@ inline void compute_all_padding_post_leakyRelu(float* dout, int wout, int pad_left, int pad_right, - int pad_left_new, - int pad_right_new, int cnt, int remain, int num) { @@ -4359,11 +4231,8 @@ inline void compute_all_padding_post_leakyRelu(float* dout, float32x4_t vscale = vld1q_f32(scale); #endif // left - for (int w = pad_left; w > 4; w--) { - *dout++ = bias[0] > 0.f ? bias[0] : bias[0] * scale[0]; - } int tmp = num - 1; - for (int i = pad_left_new; i > 0; i--) { + for (int i = pad_left; i > 0; i--) { float sum = compute_one_data_pre(din_ptr_arr[3], weights[num], bias[0], weights[5][num], 4 - i); for (int k = 0; k < num; k++) { sum += compute_one_data_pre(din_ptr_arr[2 - k], weights[tmp - k], 0.f, weights[5][tmp - k], 4 - i); @@ -4597,7 +4466,7 @@ inline void compute_all_padding_post_leakyRelu(float* dout, } // right - for (int i = 0; i < pad_right_new; i++) { + for (int i = 0; i < pad_right; i++) { float sum = compute_one_data_post(din_ptr_arr[3], weights[num], bias[0], weights[num][3 - i], 3 - i); din_ptr_arr[3]++; for (int k = 0; k < num; k++) { @@ -4606,9 +4475,6 @@ inline void compute_all_padding_post_leakyRelu(float* dout, } *dout++ = sum > 0.f ? sum : sum * scale[0]; } - for (int w = pad_right; w > 4; w--) { - *dout++ = bias[0] > 0.f ? bias[0] : bias[0] * scale[0]; - } } void conv_depthwise_5x5s1_bias_leakyRelu(float* dout, @@ -4634,10 +4500,6 @@ void conv_depthwise_5x5s1_bias_leakyRelu(float* dout, int out_size = wout * hout; int cnt = loop_w >> 2; int remain = loop_w & 3; - int pad_left_new = pad_left > 4 ? 4 : pad_left; - int pad_right_new = pad_right > 4 ? 4 : pad_right; - int pad_top_new = pad_top > 4 ? 4 : pad_top; - int pad_bottom_new = pad_bottom > 4 ? 4 : pad_bottom; int in_channel_size = chin * in_size; int out_channel_size = chin * out_size; int weights_size = 25; @@ -4676,10 +4538,9 @@ void conv_depthwise_5x5s1_bias_leakyRelu(float* dout, const float* din_ptr_arr[] = {din_ptr0, din_ptr1, din_ptr2, din_ptr3, din_ptr4, din_ptr5}; float32x4_t weights_vec[] = {wr0, wr1, wr2, wr3, wr4, wr5, wr6}; // top_h - for (int h = pad_top_new; h > 0; h--) { + for (int h = pad_top; h > 0; h--) { compute_all_padding_pre_leakyRelu(dout_ptr0, din_ptr_arr, vbias, scale, weights_vec, vzero, - win, wout, pad_left, pad_right, pad_left_new, - pad_right_new, cnt, remain, 4 - h); + win, wout, pad_left, pad_right, cnt, remain, 4 - h); dout_ptr0 += wout; din_ptr_arr[0] = din_ptr0; din_ptr_arr[1] = din_ptr1; @@ -4691,8 +4552,7 @@ void conv_depthwise_5x5s1_bias_leakyRelu(float* dout, // mid_h for (int h = 0; h < loop_h - 1; h += 2) { compute_all_padding_mid_leakyRelu_out2(dout_ptr0, dout_ptr1, din_ptr_arr, vbias, scale, weights_vec, vzero, - win, wout, pad_left, pad_right, pad_left_new, - pad_right_new, cnt, remain, 4); + win, wout, pad_left, pad_right, cnt, remain, 4); dout_ptr0 += num_out; dout_ptr1 += num_out; din_ptr0 = din_ptr2; @@ -4710,8 +4570,7 @@ void conv_depthwise_5x5s1_bias_leakyRelu(float* dout, } if (loop_h % 2 != 0) { compute_all_padding_mid_leakyRelu(dout_ptr0, din_ptr_arr, vbias, scale, weights_vec, vzero, - win, wout, pad_left, pad_right, pad_left_new, - pad_right_new, cnt, remain, 4); + win, wout, pad_left, pad_right, cnt, remain, 4); dout_ptr0 = dout_ptr1; din_ptr0 = din_ptr1; din_ptr1 = din_ptr2; @@ -4725,10 +4584,9 @@ void conv_depthwise_5x5s1_bias_leakyRelu(float* dout, din_ptr_arr[4] = din_ptr4; } // bottom - for (int h = 0; h < pad_bottom_new; h++) { + for (int h = 0; h < pad_bottom; h++) { compute_all_padding_post_leakyRelu(dout_ptr0, din_ptr_arr, vbias, scale, weights_vec, vzero, - win, wout, pad_left, pad_right, pad_left_new, - pad_right_new, cnt, remain, 3 - h); + win, wout, pad_left, pad_right, cnt, remain, 3 - h); dout_ptr0 += wout; din_ptr_arr[0] = din_ptr0; din_ptr_arr[1] = din_ptr1; diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index 602edf4cfe..7233bfd964 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -736,6 +736,7 @@ void conv_depthwise_5x5_fp32(const void* din, bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; bool ch_four = ch_in > 4 * w_in; + bool pads_five = (pad_h < 5) || (pad_w < 5); ctx->ExtendWorkspace((w_in + w_out) * sizeof(float)); bool flag_act = act_param.has_active; if (stride == 2) { @@ -754,7 +755,7 @@ void conv_depthwise_5x5_fp32(const void* din, act_param, ctx); } else if (stride == 1) { - if (ch_four || h_in < 5 || w_in < 5) { + if (ch_four || !pads_five || h_in < 5 || w_in < 5) { conv_depthwise_5x5s1_fp32(reinterpret_cast(dout), reinterpret_cast(din), reinterpret_cast(weights), diff --git a/lite/kernels/arm/conv_depthwise.cc b/lite/kernels/arm/conv_depthwise.cc index 1ad8bd599d..38437e002e 100644 --- a/lite/kernels/arm/conv_depthwise.cc +++ b/lite/kernels/arm/conv_depthwise.cc @@ -59,8 +59,9 @@ void DepthwiseConv::PrepareForRun() { #endif } else if (kw == 5) { // VLOG(5) << "invoke 5x5 dw conv fp32"; + bool pads_five = (paddings[0] < 5) || (paddings[2] < 5); auto strides = param.strides; - if (ch_four && win >= kw && hin >= kw && (strides[0] == 1 && strides[1] == 1)) { + if (ch_four && pads_five && win >= kw && hin >= kw && (strides[0] == 1 && strides[1] == 1)) { flag_trans_weights_ = false; impl_ = lite::arm::math::conv_depthwise_5x5_fp32; #ifdef LITE_WITH_PROFILE -- GitLab