diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h index 35fb09f94c49cd91915128260b6426fe0fedf725..37479c22efe95b6506054cf3ded5855aa766c34c 100644 --- a/src/operators/kernel/central-arm-func/pool_arm_func.h +++ b/src/operators/kernel/central-arm-func/pool_arm_func.h @@ -76,15 +76,17 @@ void PoolCompute(const PoolParam ¶m) { } } - } else if (ksize[0] == 2 && ksize[0] == ksize[1]) { + } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 && + strides[0] == strides[1] && paddings[0] == paddings[1] && + paddings[1] == 0) { #if __ARM_NEON #if __aarch64__ PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); #else if (pooling_type == "max") { - math::Pool2x2Max(strides, paddings, in_x, out); + math::Pool2x2Maxs2p0(strides, paddings, in_x, out); } else if (pooling_type == "avg") { - math::Pool2x2Avg(strides, paddings, in_x, out); + math::Pool2x2Avgs2p0(strides, paddings, in_x, out); } #endif #else diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp index 0a2d96d4d065d7938e6872b4f073e080d7be8c3a..76af743818edacac6dd9e1878e8d8220ccff6d73 100644 --- a/src/operators/math/pool_2x2.cpp +++ b/src/operators/math/pool_2x2.cpp @@ -20,21 +20,15 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { namespace math { +#define FLT_MAX __FLT_MAX__ -void Pool2x2Max(vector strides, vector paddings, const Tensor *input, - Tensor *output) { -#if __ARM_NEON - -#if __aarch64__ -#else +void Pool2x2Maxs2p0(vector strides, vector paddings, + const Tensor *input, Tensor *output) { const int batch_size = input->dims()[0]; - const int input_height = input->dims()[2]; - const int input_width = input->dims()[3]; const int output_channels = output->dims()[1]; - int output_height = output->dims()[2]; const int output_width = output->dims()[3]; const int ksize_height = 2; @@ -47,72 +41,110 @@ void Pool2x2Max(vector strides, vector paddings, const Tensor *input, const int input_channel_stride = input_height * input_width; const int output_channel_stride = output_height * output_width; + const int input_batch_stride = output_channels * input_channel_stride; + const int output_batch_stride = output_channels * output_channel_stride; + const float *input_data = input->data(); float *output_data = output->mutable_data(); - int out_w_num = output_width >> 2; - const int in_h_num = output_height >> 1; - const int input_batch_stride = output_channels * input_channel_stride; - const int output_batch_stride = output_channels * output_channel_stride; - int remain = output_width - out_w_num << 2; + int w1 = input_width / 16; + int _w1 = input_width % 16; + int w2 = _w1 / 4; + int _w2 = _w1 % 4; + for (int i = 0; i < batch_size; ++i) { for (int c = 0; c < output_channels; ++c) { - const float *input_data_chanel_row_next = input_data + input_width; - for (; output_height > 0; output_height--) { - if (out_w_num > 0) { - asm volatile( - "max_loop: \n\t" - "vld1.f32 {q0,q1}, [%[in_ptr1]]! \n\t" - "vld1.f32 {q2,q3}, [%[in_ptr2]]! \n\t" - "vmax.f32 q0, q0, q2 \n\t" - "vmax.f32 q1, q1, q3 \n\t" - "vpmax.f32 d4, d0, d1 \n\t" - "vpmax.f32 d5, d2, d3 \n\t" - "subs %[out_w_num], #1 \n\t" - "vst1.32 {q2}, [%[out_ptr]]! \n\t" - "bne max_loop \n\t" - : [in_ptr1] "+r"(input_data), - [in_ptr2] "+r"(input_data_chanel_row_next), - [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num) - : - : "memory", "q0", "q1", "q2", "q3"); + for (int ph = 0; ph < input_height; ph += 2) { + const float *in_ptr1 = input_data + i * input_batch_stride + + c * input_channel_stride + ph * input_width; + const float *in_ptr2 = in_ptr1 + input_width; + if (ph + 1 >= input_height) { + in_ptr2 = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * input_width)); + memset(static_cast(const_cast(in_ptr2)), -FLT_MAX, + sizeof(float) * input_width); } + float *out_ptr = output_data + i * output_batch_stride + + c * output_channel_stride + ph / 2 * output_width; + asm volatile( + "subs %[w1], %[w1], #1 \n\t" + "blt end_w1_%= \n\t" + "loop_w1_%=: \n\t" + + "pld [%[in_ptr1], #64] \n\t" + "pld [%[in_ptr2], #64] \n\t" + + "vld1.f32 {q0, q1}, [%[in_ptr1]]! \n\t" + "vld1.f32 {q2, q3}, [%[in_ptr2]]! \n\t" + "vld1.f32 {q6, q7}, [%[in_ptr1]]! \n\t" + "vld1.f32 {q8, q9}, [%[in_ptr2]]! \n\t" - for (; remain > 0; remain--) { - float max_row1 = std::max(input_data[0], input_data[1]); - float max_row2 = std::max(input_data_chanel_row_next[0], - input_data_chanel_row_next[1]); - *output_data = std::max(max_row1, max_row2); - input_data += 2; - input_data_chanel_row_next += 2; - output_data++; + "vmax.f32 q0, q0, q2 \n\t" + "vmax.f32 q1, q1, q3 \n\t" + + "vmax.f32 q6, q6, q8 \n\t" + "vmax.f32 q7, q7, q9 \n\t" + + "vpmax.f32 d8, d0, d1 \n\t" + "vpmax.f32 d9, d2, d3 \n\t" + + "vpmax.f32 d10, d12, d13 \n\t" + "vpmax.f32 d11, d14, d15 \n\t" + + "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" + + "subs %[w1], %[w1], #1 \n\t" + "bge loop_w1_%= \n\t" + "end_w1_%=: \n\t" + + "subs %[w2], %[w2], #1 \n\t" + "blt end_w2_%= \n\t" + "loop_w2_%=: \n\t" + + "vld1.f32 {q0}, [%[in_ptr1]]! \n\t" + "vld1.f32 {q1}, [%[in_ptr2]]! \n\t" + "vmax.f32 q0, q0, q1 \n\t" + "vpmax.f32 d4, d0, d1 \n\t" + "vst1.32 {d4}, [%[out_ptr]]! \n\t" + + "subs %[w2], %[w2], #1 \n\t" + "bge loop_w2_%= \n\t" + "end_w2_%=: \n\t" + : + : [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1), + [in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9"); + + if (_w2 != 0) { + in_ptr1 += 16 * w1 + 4 * w2; + in_ptr2 += 16 * w1 + 4 * w2; + out_ptr += 8 * w1 + 2 * w2; + if (_w2 == 1) { + *out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; + } else if (_w2 == 2) { + float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++; + float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; + *out_ptr = (temp > temp1) ? temp : temp1; + } else if (_w2 == 3) { + float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++; + float temp1 = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++; + *out_ptr++ = (temp > temp1) ? temp : temp1; + *out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; + } } } - input_data += input_channel_stride; - output_data += output_channel_stride; } - input_data += input_batch_stride; - output_data += output_batch_stride; } -#endif -#else -#endif } -void Pool2x2Avg(vector strides, vector paddings, const Tensor *input, - Tensor *output) { -#if __ARM_NEON - -#if __aarch64__ -#else +void Pool2x2Avgs2p0(vector strides, vector paddings, + const Tensor *input, Tensor *output) { const int batch_size = input->dims()[0]; - const int input_height = input->dims()[2]; - const int input_width = input->dims()[3]; const int output_channels = output->dims()[1]; - int output_height = output->dims()[2]; const int output_width = output->dims()[3]; const int ksize_height = 2; @@ -125,59 +157,114 @@ void Pool2x2Avg(vector strides, vector paddings, const Tensor *input, const int input_channel_stride = input_height * input_width; const int output_channel_stride = output_height * output_width; + const int input_batch_stride = output_channels * input_channel_stride; + const int output_batch_stride = output_channels * output_channel_stride; + const float *input_data = input->data(); float *output_data = output->mutable_data(); - int out_w_num = output_width >> 2; - const int input_batch_stride = output_channels * input_channel_stride; - const int output_batch_stride = output_channels * output_channel_stride; - float vqua[] = {0.25f, 0.25f, 0.25f, 0.25f}; - int remain = output_width - out_w_num << 2; + int w1 = input_width / 16; + int _w1 = input_width % 16; + int w2 = _w1 / 4; + int _w2 = _w1 % 4; + + float quarter = 1 / 4; for (int i = 0; i < batch_size; ++i) { for (int c = 0; c < output_channels; ++c) { - const float *input_data_chanel_row_next = input_data + input_width; - for (; output_height > 0; output_height--) { - if (out_w_num > 0) { - asm volatile( - "avg_loop: \n\t" - "vld1.32 {q0,q1}, [%[in_ptr1]]! \n\t" - "vld1.32 {q2,q3}, [%[in_ptr2]]! \n\t" - "vadd.f32 q0, q0, q2 \n\t" - "vadd.f32 q1, q1, q3 \n\t" - "vpadd.f32 d4, d0, d1 \n\t" - "vpadd.f32 d5, d2, d3 \n\t" - "vld1.32 {q4}, [%[vqua]]! \n\t" - "vmul.f32 q2, q2, q4 \n\t" - "subs %[out_w_num], #1 \n\t" - "vst1.32 {q2}, [%[out_ptr]]! \n\t" - "bne avg_loop \n\t" - : [in_ptr1] "+r"(input_data), - [in_ptr2] "+r"(input_data_chanel_row_next), - [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num) - : [vqua] "r"(vqua) - : "memory", "q0", "q1", "q2", "q3", "q4"); + for (int ph = 0; ph < input_height; ph += 2) { + const float *in_ptr1 = input_data + i * input_batch_stride + + c * input_channel_stride + ph * input_width; + const float *in_ptr2 = in_ptr1 + input_width; + if (ph + 1 >= input_height) { + in_ptr2 = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * input_width)); + memset(static_cast(const_cast(in_ptr2)), 0, + sizeof(float) * input_width); } + float *out_ptr = output_data + i * output_batch_stride + + c * output_channel_stride + ph / 2 * output_width; + asm volatile( + "subs %[w1], %[w1], #1 \n\t" + "blt end_w1_%= \n\t" + "loop_w1_%=: \n\t" + + "pld [%[in_ptr1], #64] \n\t" + "pld [%[in_ptr2], #64] \n\t" + + "vmov.f32 d0[0], %[quarter] \n\t" + "vld1.f32 {q1, q2}, [%[in_ptr1]]! \n\t" + "vld1.f32 {q3, q4}, [%[in_ptr2]]! \n\t" + "vld1.f32 {q7, q8}, [%[in_ptr1]]! \n\t" + "vld1.f32 {q9, q10}, [%[in_ptr2]]! \n\t" + + "vadd.f32 q1, q1, q3 \n\t" + "vadd.f32 q2, q2, q4 \n\t" - for (; remain > 0; remain--) { - float max_row1 = std::max(input_data[0], input_data[1]); - float max_row2 = std::max(input_data_chanel_row_next[0], - input_data_chanel_row_next[1]); - *output_data = std::max(max_row1, max_row2); - input_data += 2; - input_data_chanel_row_next += 2; - output_data++; + "vadd.f32 q7, q7, q9 \n\t" + "vadd.f32 q8, q8, q10 \n\t" + + "vpadd.f32 d10, d2, d3 \n\t" + "vpadd.f32 d11, d4, d5 \n\t" + + "vpadd.f32 d12, d14, d15 \n\t" + "vpadd.f32 d13, d16, d17 \n\t" + + "vmul.f32 q5, q5, d0[0] \n\t" + "vmul.f32 q6, q6, d0[0] \n\t" + + "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t" + + "subs %[w1], %[w1], #1 \n\t" + "bge loop_w1_%= \n\t" + "end_w1_%=: \n\t" + + "subs %[w2], %[w2], #1 \n\t" + "blt end_w2_%= \n\t" + "loop_w2_%=: \n\t" + + "vld1.f32 {q1}, [%[in_ptr1]]! \n\t" + "vld1.f32 {q2}, [%[in_ptr2]]! \n\t" + "vadd.f32 q1, q1, q2 \n\t" + "vpadd.f32 d4, d2, d3 \n\t" + "vmul.f32 d4, d4, d0[0] \n\t" + "vst1.32 {d4}, [%[out_ptr]]! \n\t" + + "subs %[w2], %[w2], #1 \n\t" + "bge loop_w2_%= \n\t" + "end_w2_%=: \n\t" + : + : [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1), + [in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr), + [quarter] "r"(quarter) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10"); + + if (_w2 != 0) { + in_ptr1 += 16 * w1 + 4 * w2; + in_ptr2 += 16 * w1 + 4 * w2; + out_ptr += 8 * w1 + 2 * w2; + if (_w2 == 1) { + *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2); + } else if (_w2 == 2) { + float temp = 0; + temp += *in_ptr1++; + temp += *in_ptr2++; + temp += *in_ptr1; + temp += *in_ptr2; + *out_ptr = 0.5 * temp; + } else if (_w2 == 3) { + float temp = 0; + temp += *in_ptr1++; + temp += *in_ptr2++; + temp += *in_ptr1++; + temp += *in_ptr2++; + *out_ptr++ = 0.5 * temp; + *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2); + } } } - input_data += input_channel_stride; - output_data += output_channel_stride; } - input_data += input_batch_stride; - output_data += output_batch_stride; } - -#endif -#else -#endif } //} diff --git a/src/operators/math/pool_2x2.h b/src/operators/math/pool_2x2.h index ae32a3912b677efb50d8558700741a225e3eb3f8..bd5e48482607cc868408b6371f47e0cb55caf499 100644 --- a/src/operators/math/pool_2x2.h +++ b/src/operators/math/pool_2x2.h @@ -26,11 +26,11 @@ namespace math { using framework::Tensor; using std::vector; -void Pool2x2Max(vector strides, vector paddings, const Tensor *input, - Tensor *output); +void Pool2x2Maxs2p0(vector strides, vector paddings, + const Tensor *input, Tensor *output); -void Pool2x2Avg(vector strides, vector paddings, const Tensor *in_x, - Tensor *out); +void Pool2x2Avgs2p0(vector strides, vector paddings, + const Tensor *in_x, Tensor *out); } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp index 28547b71fca6caea2ff4341b3f832c0035436a72..05d3017f635a040a52d2cc377c8f384dbbd8086c 100644 --- a/src/operators/math/pool_3x3.cpp +++ b/src/operators/math/pool_3x3.cpp @@ -558,15 +558,13 @@ void Pool3x3Max(vector strides, vector paddings, const Tensor *input, const float *input_seg = input_data + c * input_channel_stride; float *output_seg = output_data + c * output_channel_stride; for (int ph = 0; ph < output_height; ph++) { + int hstart = ph * stride - padding; + int hend = min(hstart + 3, input_height); + hstart = max(hstart, 0); for (int pw = 0; pw < output_width; pw++) { - int hstart = ph * stride - padding; int wstart = pw * stride - padding; - int hend = min(hstart + 3, input_height + padding); - int wend = min(wstart + 3, input_width + padding); - hstart = max(hstart, 0); + int wend = min(wstart + 3, input_width); wstart = max(wstart, 0); - hend = min(hend, input_height); - wend = min(wend, input_width); const float *pos1 = input_seg + hstart * input_width + wstart; const float *pos2 = input_seg + (hstart + 1) * input_width + wstart; const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;