提交 0a38f733 编写于 作者: S smilejames 提交者: GitHub

Merge pull request #889 from smilejames/develop

fix bug in pool2d 2x2 math function
...@@ -76,15 +76,17 @@ void PoolCompute(const PoolParam<CPU> &param) { ...@@ -76,15 +76,17 @@ void PoolCompute(const PoolParam<CPU> &param) {
} }
} }
} else if (ksize[0] == 2 && ksize[0] == ksize[1]) { } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
strides[0] == strides[1] && paddings[0] == paddings[1] &&
paddings[1] == 0) {
#if __ARM_NEON #if __ARM_NEON
#if __aarch64__ #if __aarch64__
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
#else #else
if (pooling_type == "max") { if (pooling_type == "max") {
math::Pool2x2Max(strides, paddings, in_x, out); math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
math::Pool2x2Avg(strides, paddings, in_x, out); math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
} }
#endif #endif
#else #else
......
...@@ -20,21 +20,15 @@ limitations under the License. */ ...@@ -20,21 +20,15 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
#define FLT_MAX __FLT_MAX__
void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input, void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
Tensor *output) { const Tensor *input, Tensor *output) {
#if __ARM_NEON
#if __aarch64__
#else
const int batch_size = input->dims()[0]; const int batch_size = input->dims()[0];
const int input_height = input->dims()[2]; const int input_height = input->dims()[2];
const int input_width = input->dims()[3]; const int input_width = input->dims()[3];
const int output_channels = output->dims()[1]; const int output_channels = output->dims()[1];
int output_height = output->dims()[2]; int output_height = output->dims()[2];
const int output_width = output->dims()[3]; const int output_width = output->dims()[3];
const int ksize_height = 2; const int ksize_height = 2;
...@@ -47,72 +41,110 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -47,72 +41,110 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
const int input_channel_stride = input_height * input_width; const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width; const int output_channel_stride = output_height * output_width;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>(); float *output_data = output->mutable_data<float>();
int out_w_num = output_width >> 2; int w1 = input_width / 16;
const int in_h_num = output_height >> 1; int _w1 = input_width % 16;
const int input_batch_stride = output_channels * input_channel_stride; int w2 = _w1 / 4;
const int output_batch_stride = output_channels * output_channel_stride; int _w2 = _w1 % 4;
int remain = output_width - out_w_num << 2;
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
const float *input_data_chanel_row_next = input_data + input_width; for (int ph = 0; ph < input_height; ph += 2) {
for (; output_height > 0; output_height--) { const float *in_ptr1 = input_data + i * input_batch_stride +
if (out_w_num > 0) { c * input_channel_stride + ph * input_width;
asm volatile( const float *in_ptr2 = in_ptr1 + input_width;
"max_loop: \n\t" if (ph + 1 >= input_height) {
"vld1.f32 {q0,q1}, [%[in_ptr1]]! \n\t" in_ptr2 = static_cast<float *>(
"vld1.f32 {q2,q3}, [%[in_ptr2]]! \n\t" paddle_mobile::memory::Alloc(sizeof(float) * input_width));
"vmax.f32 q0, q0, q2 \n\t" memset(static_cast<void *>(const_cast<float *>(in_ptr2)), -FLT_MAX,
"vmax.f32 q1, q1, q3 \n\t" sizeof(float) * input_width);
"vpmax.f32 d4, d0, d1 \n\t"
"vpmax.f32 d5, d2, d3 \n\t"
"subs %[out_w_num], #1 \n\t"
"vst1.32 {q2}, [%[out_ptr]]! \n\t"
"bne max_loop \n\t"
: [in_ptr1] "+r"(input_data),
[in_ptr2] "+r"(input_data_chanel_row_next),
[out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
:
: "memory", "q0", "q1", "q2", "q3");
} }
float *out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width;
asm volatile(
"subs %[w1], %[w1], #1 \n\t"
"blt end_w1_%= \n\t"
"loop_w1_%=: \n\t"
"pld [%[in_ptr1], #64] \n\t"
"pld [%[in_ptr2], #64] \n\t"
"vld1.f32 {q0, q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2, q3}, [%[in_ptr2]]! \n\t"
"vld1.f32 {q6, q7}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q8, q9}, [%[in_ptr2]]! \n\t"
for (; remain > 0; remain--) { "vmax.f32 q0, q0, q2 \n\t"
float max_row1 = std::max(input_data[0], input_data[1]); "vmax.f32 q1, q1, q3 \n\t"
float max_row2 = std::max(input_data_chanel_row_next[0],
input_data_chanel_row_next[1]); "vmax.f32 q6, q6, q8 \n\t"
*output_data = std::max(max_row1, max_row2); "vmax.f32 q7, q7, q9 \n\t"
input_data += 2;
input_data_chanel_row_next += 2; "vpmax.f32 d8, d0, d1 \n\t"
output_data++; "vpmax.f32 d9, d2, d3 \n\t"
"vpmax.f32 d10, d12, d13 \n\t"
"vpmax.f32 d11, d14, d15 \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"subs %[w1], %[w1], #1 \n\t"
"bge loop_w1_%= \n\t"
"end_w1_%=: \n\t"
"subs %[w2], %[w2], #1 \n\t"
"blt end_w2_%= \n\t"
"loop_w2_%=: \n\t"
"vld1.f32 {q0}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q1}, [%[in_ptr2]]! \n\t"
"vmax.f32 q0, q0, q1 \n\t"
"vpmax.f32 d4, d0, d1 \n\t"
"vst1.32 {d4}, [%[out_ptr]]! \n\t"
"subs %[w2], %[w2], #1 \n\t"
"bge loop_w2_%= \n\t"
"end_w2_%=: \n\t"
:
: [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
[in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9");
if (_w2 != 0) {
in_ptr1 += 16 * w1 + 4 * w2;
in_ptr2 += 16 * w1 + 4 * w2;
out_ptr += 8 * w1 + 2 * w2;
if (_w2 == 1) {
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
} else if (_w2 == 2) {
float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++;
float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
*out_ptr = (temp > temp1) ? temp : temp1;
} else if (_w2 == 3) {
float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++;
float temp1 = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++;
*out_ptr++ = (temp > temp1) ? temp : temp1;
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
}
} }
} }
input_data += input_channel_stride;
output_data += output_channel_stride;
} }
input_data += input_batch_stride;
output_data += output_batch_stride;
} }
#endif
#else
#endif
} }
void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input, void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
Tensor *output) { const Tensor *input, Tensor *output) {
#if __ARM_NEON
#if __aarch64__
#else
const int batch_size = input->dims()[0]; const int batch_size = input->dims()[0];
const int input_height = input->dims()[2]; const int input_height = input->dims()[2];
const int input_width = input->dims()[3]; const int input_width = input->dims()[3];
const int output_channels = output->dims()[1]; const int output_channels = output->dims()[1];
int output_height = output->dims()[2]; int output_height = output->dims()[2];
const int output_width = output->dims()[3]; const int output_width = output->dims()[3];
const int ksize_height = 2; const int ksize_height = 2;
...@@ -125,59 +157,114 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -125,59 +157,114 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
const int input_channel_stride = input_height * input_width; const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width; const int output_channel_stride = output_height * output_width;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>(); float *output_data = output->mutable_data<float>();
int out_w_num = output_width >> 2; int w1 = input_width / 16;
const int input_batch_stride = output_channels * input_channel_stride; int _w1 = input_width % 16;
const int output_batch_stride = output_channels * output_channel_stride; int w2 = _w1 / 4;
float vqua[] = {0.25f, 0.25f, 0.25f, 0.25f}; int _w2 = _w1 % 4;
int remain = output_width - out_w_num << 2;
float quarter = 1 / 4;
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
const float *input_data_chanel_row_next = input_data + input_width; for (int ph = 0; ph < input_height; ph += 2) {
for (; output_height > 0; output_height--) { const float *in_ptr1 = input_data + i * input_batch_stride +
if (out_w_num > 0) { c * input_channel_stride + ph * input_width;
asm volatile( const float *in_ptr2 = in_ptr1 + input_width;
"avg_loop: \n\t" if (ph + 1 >= input_height) {
"vld1.32 {q0,q1}, [%[in_ptr1]]! \n\t" in_ptr2 = static_cast<float *>(
"vld1.32 {q2,q3}, [%[in_ptr2]]! \n\t" paddle_mobile::memory::Alloc(sizeof(float) * input_width));
"vadd.f32 q0, q0, q2 \n\t" memset(static_cast<void *>(const_cast<float *>(in_ptr2)), 0,
"vadd.f32 q1, q1, q3 \n\t" sizeof(float) * input_width);
"vpadd.f32 d4, d0, d1 \n\t"
"vpadd.f32 d5, d2, d3 \n\t"
"vld1.32 {q4}, [%[vqua]]! \n\t"
"vmul.f32 q2, q2, q4 \n\t"
"subs %[out_w_num], #1 \n\t"
"vst1.32 {q2}, [%[out_ptr]]! \n\t"
"bne avg_loop \n\t"
: [in_ptr1] "+r"(input_data),
[in_ptr2] "+r"(input_data_chanel_row_next),
[out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
: [vqua] "r"(vqua)
: "memory", "q0", "q1", "q2", "q3", "q4");
} }
float *out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width;
asm volatile(
"subs %[w1], %[w1], #1 \n\t"
"blt end_w1_%= \n\t"
"loop_w1_%=: \n\t"
"pld [%[in_ptr1], #64] \n\t"
"pld [%[in_ptr2], #64] \n\t"
"vmov.f32 d0[0], %[quarter] \n\t"
"vld1.f32 {q1, q2}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q3, q4}, [%[in_ptr2]]! \n\t"
"vld1.f32 {q7, q8}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q9, q10}, [%[in_ptr2]]! \n\t"
"vadd.f32 q1, q1, q3 \n\t"
"vadd.f32 q2, q2, q4 \n\t"
for (; remain > 0; remain--) { "vadd.f32 q7, q7, q9 \n\t"
float max_row1 = std::max(input_data[0], input_data[1]); "vadd.f32 q8, q8, q10 \n\t"
float max_row2 = std::max(input_data_chanel_row_next[0],
input_data_chanel_row_next[1]); "vpadd.f32 d10, d2, d3 \n\t"
*output_data = std::max(max_row1, max_row2); "vpadd.f32 d11, d4, d5 \n\t"
input_data += 2;
input_data_chanel_row_next += 2; "vpadd.f32 d12, d14, d15 \n\t"
output_data++; "vpadd.f32 d13, d16, d17 \n\t"
"vmul.f32 q5, q5, d0[0] \n\t"
"vmul.f32 q6, q6, d0[0] \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"subs %[w1], %[w1], #1 \n\t"
"bge loop_w1_%= \n\t"
"end_w1_%=: \n\t"
"subs %[w2], %[w2], #1 \n\t"
"blt end_w2_%= \n\t"
"loop_w2_%=: \n\t"
"vld1.f32 {q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2}, [%[in_ptr2]]! \n\t"
"vadd.f32 q1, q1, q2 \n\t"
"vpadd.f32 d4, d2, d3 \n\t"
"vmul.f32 d4, d4, d0[0] \n\t"
"vst1.32 {d4}, [%[out_ptr]]! \n\t"
"subs %[w2], %[w2], #1 \n\t"
"bge loop_w2_%= \n\t"
"end_w2_%=: \n\t"
:
: [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
[in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr),
[quarter] "r"(quarter)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10");
if (_w2 != 0) {
in_ptr1 += 16 * w1 + 4 * w2;
in_ptr2 += 16 * w1 + 4 * w2;
out_ptr += 8 * w1 + 2 * w2;
if (_w2 == 1) {
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
} else if (_w2 == 2) {
float temp = 0;
temp += *in_ptr1++;
temp += *in_ptr2++;
temp += *in_ptr1;
temp += *in_ptr2;
*out_ptr = 0.5 * temp;
} else if (_w2 == 3) {
float temp = 0;
temp += *in_ptr1++;
temp += *in_ptr2++;
temp += *in_ptr1++;
temp += *in_ptr2++;
*out_ptr++ = 0.5 * temp;
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
}
} }
} }
input_data += input_channel_stride;
output_data += output_channel_stride;
} }
input_data += input_batch_stride;
output_data += output_batch_stride;
} }
#endif
#else
#endif
} }
//} //}
......
...@@ -26,11 +26,11 @@ namespace math { ...@@ -26,11 +26,11 @@ namespace math {
using framework::Tensor; using framework::Tensor;
using std::vector; using std::vector;
void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input, void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
Tensor *output); const Tensor *input, Tensor *output);
void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x, void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
Tensor *out); const Tensor *in_x, Tensor *out);
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -558,15 +558,13 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -558,15 +558,13 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
const float *input_seg = input_data + c * input_channel_stride; const float *input_seg = input_data + c * input_channel_stride;
float *output_seg = output_data + c * output_channel_stride; float *output_seg = output_data + c * output_channel_stride;
for (int ph = 0; ph < output_height; ph++) { for (int ph = 0; ph < output_height; ph++) {
int hstart = ph * stride - padding;
int hend = min(hstart + 3, input_height);
hstart = max(hstart, 0);
for (int pw = 0; pw < output_width; pw++) { for (int pw = 0; pw < output_width; pw++) {
int hstart = ph * stride - padding;
int wstart = pw * stride - padding; int wstart = pw * stride - padding;
int hend = min(hstart + 3, input_height + padding); int wend = min(wstart + 3, input_width);
int wend = min(wstart + 3, input_width + padding);
hstart = max(hstart, 0);
wstart = max(wstart, 0); wstart = max(wstart, 0);
hend = min(hend, input_height);
wend = min(wend, input_width);
const float *pos1 = input_seg + hstart * input_width + wstart; const float *pos1 = input_seg + hstart * input_width + wstart;
const float *pos2 = input_seg + (hstart + 1) * input_width + wstart; const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
const float *pos3 = input_seg + (hstart + 2) * input_width + wstart; const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册