diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index 0ea1f2e8f5e1430e14ec7035dad0caa7b1d64a90..c3652217ededa10b57e211ba7f5d3dc76e235978 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -1132,6 +1132,11 @@ void pooling2x2s2p0_max(const float* din, float* dr_out = data_out_channel; auto dr0 = r0; auto dr1 = r1; + if (h * S + K - P > hin + 1) { + memset(dr_out, 0.f, sizeof(float) * wout); + data_out_channel += wout; + continue; + } if (h * S + K - P > hin) { dr1 = r0; } @@ -1164,7 +1169,7 @@ void pooling2x2s2p0_max(const float* din, int wstart = 0; for (int j = 0; j < w_unroll_remian; ++j) { int wend = std::min(wstart + K, rem); - float tmp = dr0[wstart]; + float tmp = wstart < rem ? dr0[wstart] : 0.f; for (int i = wstart; i < wend; i++) { tmp = std::max(tmp, dr0[i]); tmp = std::max(tmp, dr1[i]); @@ -1222,9 +1227,20 @@ void pooling2x2s2p0_avg(const float* din, float* dr_out = data_out_channel; auto dr0 = r0; auto dr1 = r1; + if (h * S + K - P > hin + 1) { + memset(dr_out, 0.f, sizeof(float) * wout); + data_out_channel += wout; + continue; + } if (h * S + K - P > hin) { dr1 = zero_ptr; - vcoef = vdupq_n_f32(0.5f); + if (exclusive) { + vcoef = vdupq_n_f32(0.5f); + } else { + if (pad_bottom == 0) { + vcoef = vdupq_n_f32(0.5f); + } + } } int cnt_num = w_unroll_size; if (w_unroll_size > 0) { @@ -1257,11 +1273,20 @@ void pooling2x2s2p0_avg(const float* din, int wend = std::min(wstart + K, rem); float coef = 0.25f; float tmp = 0.f; - if (wend - wstart == 1 && pad_right == 0) { - coef *= 2; - } - if (h * S + K - P > hin && pad_bottom == 0) { - coef *= 2; + if (exclusive) { + if (wend - wstart == 1) { + coef *= 2; + } + if (h * S + K - P > hin) { + coef *= 2; + } + } else { + if (wend - wstart == 1 && pad_right == 0) { + coef *= 2; + } + if (h * S + K - P > hin && pad_bottom == 0) { + coef *= 2; + } } for (int i = wstart; i < wend; i++) { tmp += dr0[i] + dr1[i]; @@ -1442,7 +1467,7 @@ void pooling2x2s2p1_avg(const float* din, } if (h * S + K - P > hin) { dr1 = zero_ptr; - if (exclusive) { + if (exclusive || pad_bottom == 0) { coef_h = 1.f; } if (h * S + K - P > hin + 1) { @@ -1490,8 +1515,14 @@ void pooling2x2s2p1_avg(const float* din, int st = wstart > 0 ? wstart : 0; float tmp = 0.f; float coef = coef_h / 2; - if (exclusive && wend - st == 1) { - coef = coef_h; + if (exclusive) { + if (wend - st == 1) { + coef = coef_h; + } + } else { + if (wend - st == 1 && wstart > 0 && pad_right == 0) { + coef = coef_h; + } } for (int i = 0; i < wend - st; i++) { tmp += dr0[i] + dr1[i]; @@ -2044,7 +2075,7 @@ void pooling3x3s1p0_avg(const float* din, } else { if (pad_bottom > 1) { coef_h = 1.f / 3; - } else if (pad_bottom = 1) { + } else if (pad_bottom == 1) { coef_h = 0.5f; } else { coef_h = 1.f; @@ -2538,7 +2569,10 @@ void pooling3x3s2p0_max(const float* din, int remain = w_unroll_remian - 1; int right = wout * 2 + 1 - win; // if need right pad - + int tmp_val = (w_unroll_size * 4 + remain) * S; + int wend = std::min(tmp_val + K, win) - tmp_val; + float minval = std::numeric_limits::lowest(); + remain = right > 0 ? remain : remain + 1; for (int n = 0; n < num; ++n) { float* data_out_batch = data_out + n * chout * size_channel_out; const float* data_in_batch = data_in + n * chin * size_channel_in; @@ -2589,82 +2623,59 @@ void pooling3x3s2p0_max(const float* din, "v9", "v10", "v11"); +#else + asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX + : [dr0] "+r"(dr0), + [dr1] "+r"(dr1), + [dr2] "+r"(dr2), + [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num) + : + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11"); +#endif dr0 -= 8; dr1 -= 8; dr2 -= 8; - int rem = win - (w_unroll_size * 4) * S; - int wstart = 0; - for (int j = 0; j < w_unroll_remian; ++j) { - int wend = std::min(wstart + K, rem); - float tmp = dr0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { - tmp = std::max(tmp, dr0[i]); - tmp = std::max(tmp, dr1[i]); - tmp = std::max(tmp, dr2[i]); - } - *(dr_out++) = tmp; - wstart += S; - } -#else - asm volatile( - P3x3S2P0_INIT P3x3S2P0_MAX - "cmp %[remain], #0 @cmp cnt_num\n" - "sub %[dr0], #32 @sub - 8\n" - "sub %[dr1], #32 @sub - 8\n" - "sub %[dr2], #32 @sub - 8\n" - "ble 4f @ble exit1\n" - "2: @mid loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load \n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load \n" - "vld1.f32 {d4-d5}, [%[dr2]]! @load \n" - "vmov.f32 s3,s2 @mov \n" - "vmov.f32 s7,s6 @mov \n" - "vmov.f32 s11,s10 @mov \n" - "vmax.f32 q0, q0, q1 @max n" - "sub %[dr0], #8 @add w \n" - "sub %[dr1], #8 @add w \n" - "sub %[dr2], #8 @add w \n" - "vmax.f32 q0, q0, q2 @max \n" - "vpmax.f32 d0, d0, d1 @pmax \n" - "vpmax.f32 d0, d0, d0 @pmax \n" - "subs %[remain], #1 @subs \n" - "vst1.f32 d0[0], [%[dr_out]]! @vst \n" - "bne 2b @bne \n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [remain] "+r"(cnt_remain), - [cnt_num] "+r"(cnt_num) - : - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); - if (right) { - int wstart = (w_unroll_size * 4 + remain) * S; - int wend = std::min(wstart + K, win); - float tmp = dr0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { - tmp = std::max(tmp, std::max(dr0[i], dr1[i])); - tmp = std::max(tmp, dr2[i]); - } - *(dr_out++) = tmp; + } + for (int w = 0; w < remain; w += 1) { + float32x4_t vr0 = vld1q_f32(dr0); + float32x4_t vr1 = vld1q_f32(dr1); + float32x4_t vr2 = vld1q_f32(dr2); + vr0 = vsetq_lane_f32(minval, vr0, 3); + vr1 = vsetq_lane_f32(minval, vr1, 3); + vr2 = vsetq_lane_f32(minval, vr2, 3); + float32x4_t vmax1 = vmaxq_f32(vr0, vr1); + vmax1 = vmaxq_f32(vmax1, vr2); + float32x2_t vmax2 = + vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); + float32x2_t vmax = vpmax_f32(vmax2, vmax2); + dr_out[0] = vget_lane_f32(vmax, 0); + dr_out++; + dr0 += 2; + dr1 += 2; + dr2 += 2; + } + if (right) { + float tmp = dr0[0]; // std::numeric_limits::min(); + for (int i = 0; i < wend; i++) { + tmp = std::max(tmp, std::max(dr0[i], dr1[i])); + tmp = std::max(tmp, dr2[i]); } -#endif + *(dr_out++) = tmp; } - r0 = r2; r1 = r0 + win; r2 = r1 + win; @@ -2815,6 +2826,7 @@ void pooling3x3s2p0_avg(const float* din, dr2 -= 8; } // deal with right pad + w_unroll_size = w_unroll_size < 0 ? 0 : w_unroll_size; int wstart = w_unroll_size * 4 * S - P; for (int j = 0; j < w_unroll_remian; ++j) { int wend = wstart + K; // std::min(wstart + K, win); diff --git a/lite/kernels/arm/pool_compute.cc b/lite/kernels/arm/pool_compute.cc index 5cfca8f1b7d9a286d24dda5af5664aa381c8e0f1..6865d65b1f46d517c35728f9ecc3c2ff10565e2a 100644 --- a/lite/kernels/arm/pool_compute.cc +++ b/lite/kernels/arm/pool_compute.cc @@ -57,6 +57,8 @@ void PoolCompute::Run() { (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && pads_less; bool global_pooling = (paddings[0] == 0) && (ksize[0] == in_dims[2]) && (ksize[1] == in_dims[3]) && kps_equal && pads_equal; + bool win_ksize = (in_dims[2] > ksize[0]) && (in_dims[3] > ksize[1]); + kps_equal = kps_equal && win_ksize; global_pooling = param.global_pooling || global_pooling; if (global_pooling) { diff --git a/lite/tests/math/pool_compute_test.cc b/lite/tests/math/pool_compute_test.cc index e0d4de61747d5772edd94f7ad66cfe99e8cf0457..890973a0dfbb47d3943e7c463ef6fc7092fd367f 100644 --- a/lite/tests/math/pool_compute_test.cc +++ b/lite/tests/math/pool_compute_test.cc @@ -435,7 +435,7 @@ TEST(TestPoolRand, test_pool_rand) { adaptive, use_quantizer, pooling_type, - {1, 2, 4}, + {4}, {FLAGS_power_mode}); } }