diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index cde58fb77b49f06187c24298229395532a60857e..651bfb06916e15e7f95212a82f16619ae8f16d6a 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -206,19 +206,19 @@ void pooling_basic(const float* din, "ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \ "ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/ -#define P2x2S2P1_MAX \ - "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */ \ - "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */ \ - "sub %[dr0], %[dr0], #4\n" /* sub */ \ - "sub %[dr1], %[dr1], #4\n" /* sub */ \ - "fmax v4.4s, v0.4s, v6.4s\n" /* max */ \ - "fmax v5.4s, v2.4s, v8.4s\n" /* max */ \ - "ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \ - "ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/ \ - "fmax v6.4s, v4.4s, v5.4s\n" /* max reduce */ \ - "subs %w[cnt_num], %w[cnt_num], #1\n" /* subs cnt_num, #1*/ \ - "st1 {v6.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \ - "ble 2f\n" /* bne s3_max_loop_mid */ +#define P2x2S2P1_MAX \ + "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */ \ + "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */ \ + "sub %[dr0], %[dr0], #4\n" /* sub */ \ + "sub %[dr1], %[dr1], #4\n" /* sub */ \ + "fmax v4.4s, v0.4s, v6.4s\n" /* max */ \ + "fmax v5.4s, v2.4s, v8.4s\n" /* max */ \ + "ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \ + "ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/ \ + "fmax v6.4s, v4.4s, v5.4s\n" /* max reduce */ \ + "subs %w[cnt_num], %w[cnt_num], #1\n" /* subs cnt_num, #1*/ \ + "st1 {v6.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \ + "ble 2f\n" /* bne s3_max_loop_mid */ #define P2x2S2P0_MAX \ "1: \n" \ @@ -231,20 +231,20 @@ void pooling_basic(const float* din, "st1 {v6.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \ "bne 1b\n" /* bne s3_max_loop_mid */ -#define P2x2S2P1_AVG \ - "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */ \ - "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */ \ - "sub %[dr0], %[dr0], #4\n" /* sub */ \ - "sub %[dr1], %[dr1], #4\n" /* sub */ \ - "fadd v4.4s, v0.4s, v6.4s\n" /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \ - "fadd v5.4s, v2.4s, v8.4s\n" /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \ - "ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \ - "ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/ \ - "fadd v6.4s, v4.4s, v5.4s\n" /* add reduce */ \ - "subs %w[cnt_num], %w[cnt_num], #1\n" /* subs cnt_num, #1*/ \ - "fmul v4.4s, v6.4s, %[vcoef_left].4s\n" /* mul coef */ \ - "st1 {v4.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \ - "ble 2f\n" /* bne s3_max_loop_mid */ +#define P2x2S2P1_AVG \ + "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */ \ + "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */ \ + "sub %[dr0], %[dr0], #4\n" /* sub */ \ + "sub %[dr1], %[dr1], #4\n" /* sub */ \ + "fadd v4.4s, v0.4s, v6.4s\n" /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \ + "fadd v5.4s, v2.4s, v8.4s\n" /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \ + "ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \ + "ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/ \ + "fadd v6.4s, v4.4s, v5.4s\n" /* add reduce */ \ + "subs %w[cnt_num], %w[cnt_num], #1\n" /* subs cnt_num, #1*/ \ + "fmul v4.4s, v6.4s, %[vcoef_left].4s\n" /* mul coef */ \ + "st1 {v4.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \ + "ble 2f\n" /* bne s3_max_loop_mid */ #define P2x2S2P0_AVG \ "1: \n" /* load bias to q2, q3*/ \ @@ -549,8 +549,8 @@ void pooling_basic(const float* din, "vld2.f32 {d4-d7}, [%[dr1]]! @ load \n" #define P2x2S2P1_MAX \ - "vext.32 q4, %q[vzero], q1, #3 @ 1357-0135\n" \ - "vext.32 q5, %q[vzero], q3, #3 @ 1357-0135\n" \ + "vext.32 q4, %q[vzero], q1, #3 @ 1357-0135\n" \ + "vext.32 q5, %q[vzero], q3, #3 @ 1357-0135\n" \ "sub %[dr0], #4 @sub \n" \ "sub %[dr1], #4 @sub \n" \ "vmax.f32 q8, q0, q4 @ max \n" \ @@ -558,7 +558,7 @@ void pooling_basic(const float* din, "vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \ "vld2.f32 {d4-d7}, [%[dr1]]! @ load \n" \ "vmax.f32 q5, q9, q8 @ max reduce\n" \ - "subs %[cnt_num], #1 @ subs cnt_num \n" \ + "subs %[cnt_num], #1 @ subs cnt_num \n" \ "vst1.f32 {d10-d11}, [%[dr_out]]! @ store 4 out \n" \ "ble 2f @ bne \n" @@ -574,8 +574,8 @@ void pooling_basic(const float* din, "bne 1b @ bne \n" #define P2x2S2P1_AVG \ - "vext.32 q4, %q[vzero], q1, #3 @ 1357-0135\n" \ - "vext.32 q5, %q[vzero], q3, #3 @ 1357-0135\n" \ + "vext.32 q4, %q[vzero], q1, #3 @ 1357-0135\n" \ + "vext.32 q5, %q[vzero], q3, #3 @ 1357-0135\n" \ "sub %[dr0], #4 @sub \n" \ "sub %[dr1], #4 @sub \n" \ "vadd.f32 q9, q0, q4 @ max \n" \ @@ -583,9 +583,9 @@ void pooling_basic(const float* din, "vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \ "vld2.f32 {d4-d7}, [%[dr1]]! @ load \n" \ "vadd.f32 q5, q9, q8 @ max reduce\n" \ - "subs %[cnt_num], %[cnt_num], #1 @ subs cnt_num \n" \ - "vmul.f32 q4, q5, %q[vcoef_left] @ mul coef \n" \ - "vst1.f32 {d8-d9}, [%[dr_out]]! @ store 4 out \n" \ + "subs %[cnt_num], #1 @ subs cnt_num \n" \ + "vmul.f32 q4, q5, %q[vcoef_left] @ mul coef \n" \ + "vst1.f32 {d8-d9}, [%[dr_out]]! @ store 4 out \n" \ "ble 2f @ bne\n" #define P2x2S2P0_AVG \ @@ -1097,16 +1097,16 @@ void pooling1x1s2p0_max(const float* din, } void pooling2x2s2p0_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - int pad_bottom, - int pad_right) { + float* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1181,17 +1181,17 @@ void pooling2x2s2p0_max(const float* din, } void pooling2x2s2p0_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive, - int pad_bottom, - int pad_right) { + float* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1258,10 +1258,10 @@ void pooling2x2s2p0_avg(const float* din, float coef = 0.25f; float tmp = 0.f; if (wend - wstart == 1 && pad_right == 0) { - coef *= 2; + coef *= 2; } if (h * S + K - P > hin && pad_bottom == 0) { - coef *= 2; + coef *= 2; } for (int i = wstart; i < wend; i++) { tmp += dr0[i] + dr1[i]; @@ -1280,16 +1280,16 @@ void pooling2x2s2p0_avg(const float* din, } void pooling2x2s2p1_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - int pad_bottom, - int pad_right) { + float* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1321,19 +1321,19 @@ void pooling2x2s2p1_max(const float* din, auto dr0 = r0; auto dr1 = r1; if ( h == 0 ) { - dr0 = r0; - dr1 = r0; - r0 = r1; - r1 = r0 + win; + dr0 = r0; + dr1 = r0; + r0 = r1; + r1 = r0 + win; } else { r0 = r1 + win; - r1 = r0 + win; + r1 = r0 + win; } if (h * S + K - P > hin) { dr1 = dr0; if (h * S + K - P > hin + 1) { - memset(dr_out, 0, wout * sizeof(float)); - continue; + memset(dr_out, 0, wout * sizeof(float)); + continue; } } int cnt_num = w_unroll_size; @@ -1345,19 +1345,16 @@ void pooling2x2s2p1_max(const float* din, [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num) - : [vzero] "w" (vzero) + : [vzero] "w"(vzero) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8"); -#else - // cnt_num -= 1; +#else asm volatile( - P2x2S2_INIT - P2x2S2P1_MAX - P2x2S2P0_MAX "2: \n" /* end */ + P2x2S2_INIT P2x2S2P1_MAX P2x2S2P0_MAX "2: \n" /* end */ : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num) - : [vzero] "w" (vzero) + : [vzero] "w"(vzero) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9"); #endif dr0 -= 8; @@ -1385,17 +1382,17 @@ void pooling2x2s2p1_max(const float* din, } void pooling2x2s2p1_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive, - int pad_bottom, - int pad_right) { + float* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1432,12 +1429,12 @@ void pooling2x2s2p1_avg(const float* din, auto dr1 = r1; float coef_h = 0.5f; if ( h == 0 ) { - dr0 = zero_ptr; - dr1 = r0; - r0 = r1; - r1 = r0 + win; - if (exclusive) { - coef_h = 1.f; + dr0 = zero_ptr; + dr1 = r0; + r0 = r1; + r1 = r0 + win; + if (exclusive) { + coef_h = 1.f; } } else { r0 = r1 + win; @@ -1446,16 +1443,17 @@ void pooling2x2s2p1_avg(const float* din, if (h * S + K - P > hin) { dr1 = zero_ptr; if (exclusive) { - coef_h = 1.f; + coef_h = 1.f; } if (h * S + K - P > hin + 1) { - memset(dr_out, 0, wout * sizeof(float)); - continue; + memset(dr_out, 0, wout * sizeof(float)); + continue; } } float coef_left_most = exclusive ? coef_h : coef_h / 2; float32x4_t vcoef = vdupq_n_f32(coef_h / 2); - float coef_left[4] = {coef_left_most, coef_h / 2, coef_h / 2, coef_h / 2}; + float coef_left[4] = + {coef_left_most, coef_h / 2, coef_h / 2, coef_h / 2}; float32x4_t vcoef_left = vld1q_f32(coef_left); int cnt_num = w_unroll_size; if (w_unroll_size > 0) { @@ -1466,16 +1464,20 @@ void pooling2x2s2p1_avg(const float* din, [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num) - : [vcoef] "w"(vcoef), [vzero] "w"(vzero), [vcoef_left] "w"(vcoef_left) + : [vcoef] "w"(vcoef), + [vzero] "w"(vzero), + [vcoef_left] "w"(vcoef_left) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8"); #else asm volatile( - P2x2S2_INIT P2x2S2P1_AVG P2x2S2P0_AVG "2: \n" + P2x2S2_INIT P2x2S2P1_AVG P2x2S2P0_AVG "2: \n" : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num) - : [vcoef] "w"(vcoef), [vzero] "w"(vzero), [vcoef_left] "w"(vcoef_left) + : [vcoef] "w"(vcoef), + [vzero] "w"(vzero), + [vcoef_left] "w"(vcoef_left) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9"); #endif dr0 -= 8; @@ -1489,7 +1491,7 @@ void pooling2x2s2p1_avg(const float* din, float tmp = 0.f; float coef = coef_h / 2; if (exclusive && wend - st == 1) { - coef = coef_h; + coef = coef_h; } for (int i = 0; i < wend - st; i++) { tmp += dr0[i] + dr1[i]; @@ -2535,7 +2537,7 @@ void pooling3x3s2p0_max(const float* din, } int remain = w_unroll_remian - 1; - int right = wout * 2 + 1 - win; // if need right pad + int right = wout * 2 + 1 - win; // if need right pad for (int n = 0; n < num; ++n) { float* data_out_batch = data_out + n * chout * size_channel_out; @@ -2571,7 +2573,6 @@ void pooling3x3s2p0_max(const float* din, [dr1] "+r"(dr1), [dr2] "+r"(dr2), [dr_out] "+r"(dr_out), - [remain] "+r" (cnt_remain), [cnt_num] "+r"(cnt_num) : : "cc", @@ -2594,76 +2595,73 @@ void pooling3x3s2p0_max(const float* din, int rem = win - (w_unroll_size * 4) * S; int wstart = 0; for (int j = 0; j < w_unroll_remian; ++j) { - int wend = std::min(wstart + K, rem); - float tmp = dr0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { - tmp = std::max(tmp, dr0[i]); - tmp = std::max(tmp, dr1[i]); - tmp = std::max(tmp, dr2[i]); + int wend = std::min(wstart + K, rem); + float tmp = dr0[wstart]; // std::numeric_limits::min(); + for (int i = wstart; i < wend; i++) { + tmp = std::max(tmp, dr0[i]); + tmp = std::max(tmp, dr1[i]); + tmp = std::max(tmp, dr2[i]); + } + *(dr_out++) = tmp; + wstart += S; } - *(dr_out++) = tmp; - wstart += S; - } #else asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX - "cmp %[remain], #0 @cmp cnt_num, 0\n" - "sub %[dr0], #32 @sub - 8\n" - "sub %[dr1], #32 @sub - 8\n" - "sub %[dr2], #32 @sub - 8\n" - "ble 4f @ble exit1\n" - "2: @mid loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, dr1\n" - "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3, dr1\n" - "vmov.f32 s3,s2 @movs3, s2\n" - "vmov.f32 s7,s6 @movs7, s6\n" - "vmov.f32 s11,s10 @movs11, s10\n" - "vmax.f32 q0, q0, q1 @max q0, q0, q1\n" - "sub %[dr0], #8 @add w, 6\n" - "sub %[dr1], #8 @add w, 6\n" - "sub %[dr2], #8 @add w, 6\n" - "vmax.f32 q0, q0, q2 @max q0, q0, q2\n" - "vpmax.f32 d0, d0, d1 @pmax d0, d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0, d0, d0\n" - "subs %[remain], #1 @subs cnt_num, #1\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0], dr_out\n" - "bne 2b @bne s3_max_loop_mid_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [remain] "+r" (cnt_remain), - [cnt_num] "+r"(cnt_num) - : - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -#endif - // dr0 -= 8; - // dr1 -= 8; - // dr2 -= 8; - if (right){ + "cmp %[remain], #0 @cmp cnt_num, 0\n" + "sub %[dr0], #32 @sub - 8\n" + "sub %[dr1], #32 @sub - 8\n" + "sub %[dr2], #32 @sub - 8\n" + "ble 4f @ble exit1\n" + "2: @mid loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, dr1\n" + "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3, dr1\n" + "vmov.f32 s3,s2 @movs3, s2\n" + "vmov.f32 s7,s6 @movs7, s6\n" + "vmov.f32 s11,s10 @movs11, s10\n" + "vmax.f32 q0, q0, q1 @max q0, q0, q1\n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "sub %[dr2], #8 @add w, 6\n" + "vmax.f32 q0, q0, q2 @max q0, q0, q2\n" + "vpmax.f32 d0, d0, d1 @pmax d0, d0,d1\n" + "vpmax.f32 d0, d0, d0 @pmax d0, d0, d0\n" + "subs %[remain], #1 @subs cnt_num, #1\n" + "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0], dr_out\n" + "bne 2b @bne s3_max_loop_mid_1\n" + "4: @exit\n" + : [dr0] "+r"(dr0), + [dr1] "+r"(dr1), + [dr2] "+r"(dr2), + [dr_out] "+r"(dr_out), + [remain] "+r"(cnt_remain), + [cnt_num] "+r"(cnt_num) + : + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11"); + if (right) { int wstart = (w_unroll_size * 4 + remain) * S; int wend = std::min(wstart + K, win); - float tmp = dr0[wstart];//std::numeric_limits::min(); - for(int i = wstart; i < wend; i++){ - tmp = std::max(tmp,std::max(dr0[i],dr1[i])); - tmp = std::max(tmp,dr2[i]); + float tmp = dr0[wstart]; //std::numeric_limits::min(); + for (int i = wstart; i < wend; i++) { + tmp = std::max(tmp, std::max(dr0[i], dr1[i])); + tmp = std::max(tmp, dr2[i]); } *(dr_out++) = tmp; } +#endif } r0 = r2; diff --git a/lite/backends/arm/math/pooling.h b/lite/backends/arm/math/pooling.h index 7d75bab5ddbc0aa08c7e1059379c49921dc2cabd..572919e3f083f736d8f49b3bae0dd2820fac35c4 100644 --- a/lite/backends/arm/math/pooling.h +++ b/lite/backends/arm/math/pooling.h @@ -77,54 +77,54 @@ void pooling1x1s2p0_max(const float* din, int pad_right); void pooling2x2s2p0_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - int pad_bottom, - int pad_right); + float* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + int pad_bottom, + int pad_right); void pooling2x2s2p0_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive, - int pad_bottom, - int pad_right); + float* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + bool exclusive, + int pad_bottom, + int pad_right); void pooling2x2s2p1_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - int pad_bottom, - int pad_right); + float* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + int pad_bottom, + int pad_right); void pooling2x2s2p1_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive, - int pad_bottom, - int pad_right); + float* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s1p1_max(const float* din, float* dout, diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp index c8ea65fce4e02ad553daba79a019d9055d1128f5..2bc4f91f1d8c76b243a0ffb4a083f8d6ab138553 100644 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp @@ -70,8 +70,7 @@ class PoolingPE : public PE { param_.poolingArgs = args; // use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 - // && - // (k_width > 7 || k_height > 7); + // && (k_width > 7 || k_height > 7); use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 && (k_width > 255 || k_height > 255); // use_cpu_ = param_.type == AVERAGE; diff --git a/lite/kernels/arm/pool_compute.cc b/lite/kernels/arm/pool_compute.cc index 7a383f297c49600d44f5ac83badb85fd2afb69b3..5cfca8f1b7d9a286d24dda5af5664aa381c8e0f1 100644 --- a/lite/kernels/arm/pool_compute.cc +++ b/lite/kernels/arm/pool_compute.cc @@ -109,60 +109,60 @@ void PoolCompute::Run() { kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling2x2s2p0_max(din, - dout, - out_dims[0], - out_dims[1], - out_dims[2], - out_dims[3], - in_dims[1], - in_dims[2], - in_dims[3], - paddings[1], - paddings[3]); + dout, + out_dims[0], + out_dims[1], + out_dims[2], + out_dims[3], + in_dims[1], + in_dims[2], + in_dims[3], + paddings[1], + paddings[3]); return; } else if (pooling_type == "avg") { lite::arm::math::pooling2x2s2p0_avg(din, - dout, - out_dims[0], - out_dims[1], - out_dims[2], - out_dims[3], - in_dims[1], - in_dims[2], - in_dims[3], - exclusive, - paddings[1], - paddings[3]); + dout, + out_dims[0], + out_dims[1], + out_dims[2], + out_dims[3], + in_dims[1], + in_dims[2], + in_dims[3], + exclusive, + paddings[1], + paddings[3]); return; } } else if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 1 && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling2x2s2p1_max(din, - dout, - out_dims[0], - out_dims[1], - out_dims[2], - out_dims[3], - in_dims[1], - in_dims[2], - in_dims[3], - paddings[1], - paddings[3]); + dout, + out_dims[0], + out_dims[1], + out_dims[2], + out_dims[3], + in_dims[1], + in_dims[2], + in_dims[3], + paddings[1], + paddings[3]); return; } else if (pooling_type == "avg") { lite::arm::math::pooling2x2s2p1_avg(din, - dout, - out_dims[0], - out_dims[1], - out_dims[2], - out_dims[3], - in_dims[1], - in_dims[2], - in_dims[3], - exclusive, - paddings[1], - paddings[3]); + dout, + out_dims[0], + out_dims[1], + out_dims[2], + out_dims[3], + in_dims[1], + in_dims[2], + in_dims[3], + exclusive, + paddings[1], + paddings[3]); return; } } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 && diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc index aba5815741734342a367d3316de14cdae6b1c896..51f67a1c6f0122c1140aeb762b448a928bd16692 100644 --- a/lite/kernels/npu/bridges/pool_op.cc +++ b/lite/kernels/npu/bridges/pool_op.cc @@ -64,20 +64,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { return FAILED; } - auto padding = op_info->GetAttr>("paddings"); - bool pads_equal = (padding[0] == padding[1]) && (padding[2] == padding[3]); - if (!pads_equal) { - LOG(FATAL) - << "padding requires pad_left == pad_right, pad_top == pad_bottom"; - } - auto npu_pad = - ge::AttrValue::LIST_INT{padding[0], padding[1], padding[2], padding[3]}; - auto strides = op_info->GetAttr>("strides"); - auto npu_stride = ge::AttrValue::LIST_INT(strides.begin(), strides.end()); - int npu_ceil_mode = 0; - if (op_info->HasAttr("ceil_mode")) { - npu_ceil_mode = op_info->GetAttr("ceil_mode") ? 1 : 0; - // pad mode int pad_mode = 0; std::string padding_algorithm(""); diff --git a/lite/kernels/xpu/bridges/pool_op_test.cc b/lite/kernels/xpu/bridges/pool_op_test.cc deleted file mode 100644 index 7efc6b464c00c945c71c8c5689e18823cde10f97..0000000000000000000000000000000000000000 --- a/lite/kernels/xpu/bridges/pool_op_test.cc +++ /dev/null @@ -1,268 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/pool_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/xpu/bridges/registry.h" -#include "lite/kernels/xpu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -void pool_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto& in_dims = x->dims(); - auto& out_dims = out->dims(); - - const float* src_ptr = x->data(); - float* dst_ptr = out->mutable_data(); - - std::vector ksize = op_info->GetAttr>("ksize"); - std::vector strides = op_info->GetAttr>("strides"); - std::vector paddings = op_info->GetAttr>("paddings"); - bool exclusive = op_info->GetAttr("exclusive"); - std::string pooling_type = op_info->GetAttr("pooling_type"); - bool global_pooling = op_info->GetAttr("global_pooling"); - - int in_n = in_dims[0]; - int in_c = in_dims[1]; - int in_h = in_dims[2]; - int in_w = in_dims[3]; - int size_in_n = in_c * in_h * in_w; - int size_in_c = in_h * in_w; - - int out_h = out_dims[2]; - int out_w = out_dims[3]; - int size_out_n = in_c * out_h * out_w; - int size_out_c = out_h * out_w; - - int window_h = ksize[0]; - int window_w = ksize[1]; - int stride_h = strides[0]; - int stride_w = strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[2]; - - if (global_pooling == true) { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - const float* src = src_ptr + n * size_in_n + c * size_in_c; - float res = src[0]; - if (pooling_type == "max") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res = cur_val > res ? cur_val : res; - } - } else if (pooling_type == "avg") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res += cur_val; - } - res /= size_in_c; - } - dst_ptr[n * size_out_n + c] = res; - } - } - } else { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - for (int h = 0; h < out_h; ++h) { - int sh = h * stride_h; - int eh = sh + window_h; - sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; - eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; - for (int w = 0; w < out_w; ++w) { - int sw = w * stride_w; - int ew = sw + window_w; - sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; - ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; - int pooling_size = (ew - sw) * (eh - sh); - if (pooling_size == 0) continue; - float res = 0.f; - for (int kh = sh; kh < eh; ++kh) { - for (int kw = sw; kw < ew; ++kw) { - int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; - if (kh == sh && kw == sw) { - res = src_ptr[src_idx]; - } else { - if (pooling_type == "max") { - res = res >= src_ptr[src_idx] ? res : src_ptr[src_idx]; - } - if (pooling_type == "avg") { - res += src_ptr[src_idx]; - } - } - } - } - if (pooling_type == "avg") { - if (exclusive) { - res /= pooling_size; - } else { - res /= window_h * window_w; - } - } - dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; - } - } - } - } - } -} - -void test_pool(int bs, - int ic, - int ih, - int iw, - std::string pooling_type, - bool ceil_mode, - bool global_pooling, - bool exclusive, - int ksize, - int stride, - int padding) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("pool2d"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("pooling_type", pooling_type); - opdesc.SetAttr("ksize", std::vector({ksize, ksize})); - opdesc.SetAttr("global_pooling", global_pooling); - opdesc.SetAttr("exclusive", exclusive); - opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", - std::vector({padding, padding, padding, padding})); - opdesc.SetAttr("ceil_mode", ceil_mode); - - // create and convert op to XPU model, then run it on XPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - pool_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(XPUBridges, pool) { - for (auto pooling_type : {"max", "avg"}) { - for (auto bs : {1, 3}) { - for (auto ic : {2}) { - for (auto ih : {3}) { - for (auto iw : {4}) { - test_pool(bs, ic, ih, iw, pooling_type, true, true, true, 0, 1, 0); - } - } - } - } - } - - for (auto pooling_type : {"max"}) { - for (auto ceil_mode : {true, false}) { - for (auto ksize : {2, 3}) { - for (auto stride : {1, 2}) { - for (auto padding : {0, 1}) { - for (auto bs : {1, 3}) { - for (auto ic : {2}) { - for (auto ih : {3}) { - for (auto iw : {4}) { - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - false, - true, - ksize, - stride, - padding); - } - } - } - } - } - } - } - } - } - - for (auto pooling_type : {"avg"}) { - for (auto ceil_mode : {true, false}) { - for (auto exclusive : {true, false}) { - for (auto ksize : {2, 3}) { - for (auto stride : {1, 2}) { - for (auto padding : {0, 1}) { - for (auto bs : {1, 3}) { - for (auto ic : {2}) { - for (auto ih : {3}) { - for (auto iw : {4}) { - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - false, - exclusive, - ksize, - stride, - padding); - } - } - } - } - } - } - } - } - } - } -} - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(pool2d); -USE_XPU_BRIDGE(pool2d); diff --git a/lite/operators/pool_op.cc b/lite/operators/pool_op.cc index 7048e15b82b0153c9112c20a498ec5725dee7929..5fb990928ec1ae723bc12b695af1be5e50da5079 100644 --- a/lite/operators/pool_op.cc +++ b/lite/operators/pool_op.cc @@ -41,39 +41,6 @@ bool PoolOpLite::CheckShape() const { return true; } -inline void UpdatePadding(std::vector* paddings, - const bool global_pooling, - const bool adaptive, - const std::string padding_algorithm, - const lite::DDim data_dims, - const std::vector& strides, - const std::vector& ksize) { - // when padding_algorithm is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (int i = 0; i < strides.size(); ++i) { - int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; - int pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2], - (int64_t)0); - int pad_0 = pad_sum / 2; - int pad_1 = pad_sum - pad_0; - *(paddings->begin() + i * 2) = pad_0; - *(paddings->begin() + i * 2 + 1) = pad_1; - } - } else if (padding_algorithm == "VALID") { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } - - // if global_pooling == true or adaptive == true, padding will be ignore - if (global_pooling || adaptive) { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } -} - int PoolOutputSize(int input_size, int filter_size, int pad_left,