diff --git a/src/operators/kernel/arm/conv_add_kernel.cpp b/src/operators/kernel/arm/conv_add_kernel.cpp
index 24c68a090592dca70bc403861d0684d375955dbf..a1da00bb199d43e1a4b69e9253dec832c6f92842 100644
--- a/src/operators/kernel/arm/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_kernel.cpp
@@ -42,8 +42,6 @@ void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
 template <>
 void ConvAddKernel<CPU, float>::Compute(
     const FushionConvAddParam &param) const {
-  DLOG << param;
-
   const Tensor *input = param.Input();
   Tensor filter = *param.Filter();
   Tensor bias = *param.Bias();
diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp
index 546ae33407d4c5affd6459d4167ba5b373887f12..1ec022ffab41fc41084220651d286b20ea43d7bb 100644
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -21,8 +21,6 @@ namespace operators {
 
 template <>
 void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
-  LOG(kLOG_DEBUG) << param;
-
   const Tensor *input = param.Input();
   Tensor filter = *param.Filter();
   Tensor *output = param.Output();
@@ -32,8 +30,6 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
   std::vector<int> dilations = param.Dilations();
-  // DLOG << " compute end get Attrs " << strides[0];
-
   const int batch_size = static_cast<int>(input->dims()[0]);
 
   std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
@@ -66,7 +62,6 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   framework::DDim filter_matrix_shape = {filter.dims()[0],
                                          filter.numel() / filter.dims()[0]};
   filter.Resize(filter_matrix_shape);
-  DLOG << " filter.dims() = " << filter.dims();
   framework::DDim output_matrix_shape = {
       output->dims()[1],
       output->numel() / (output->dims()[0] * output->dims()[1])};
diff --git a/src/operators/kernel/arm/pool_kernel.cpp b/src/operators/kernel/arm/pool_kernel.cpp
index 2809a802a6cf94c931e409aecfa0090139624a46..646f538d7a637b4b009b51b9305d607325a8e54e 100644
--- a/src/operators/kernel/arm/pool_kernel.cpp
+++ b/src/operators/kernel/arm/pool_kernel.cpp
@@ -56,22 +56,25 @@ void PoolKernel<CPU, float>::Compute(const PoolParam &param) const {
       paddings[i] = 0;
       ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
     }
-  }
-
-  PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
-
-  // if (param.isGlobalPooling() || ksize[0] != ksize[1] ||
-  //     strides[0] != strides[1] || strides[1] != 2 ||
-  //     paddings[0] != paddings[1] || paddings[1] > 1) {
-  //   PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
-  //
-  // } else if (ksize[0] == 2) {
-  //
-  // } else if (ksize[0] == 3) {
-  //
-  // } else {
-  //   PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
-  // }
+    // Global pooling still needs an output; use the generic path here.
+    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+  } else if (ksize[0] == 3 && ksize[0] == ksize[1]) {
+    if (pooling_type == "max") {
+      math::Pool3x3Max(strides, paddings, in_x, out);
+    } else if (pooling_type == "avg") {
+      math::Pool3x3Avg(strides, paddings, in_x, out);
+    }
+
+  } else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
+    if (pooling_type == "max") {
+      math::Pool2x2Max(strides, paddings, in_x, out);
+    } else if (pooling_type == "avg") {
+      math::Pool2x2Avg(strides, paddings, in_x, out);
+    }
+
+  } else {
+    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+  }
 }
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..96d277c136b4656dbb1fd682489bd7dee5c3af0e
--- /dev/null
+++ b/src/operators/math/pool_2x2.cpp
@@ -0,0 +1,143 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef POOL_OP
+#include <algorithm>
+#include "pool_2x2.h"
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+// Both kernels assume non-overlapping 2x2 windows (stride 2, no padding),
+// i.e. input_width == 2 * output_width and input_height == 2 * output_height.
+void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
+                Tensor *output) {
+#if __ARM_NEON
+  const int batch_size = input->dims()[0];
+  const int input_height = input->dims()[2];
+  const int input_width = input->dims()[3];
+  const int output_channels = output->dims()[1];
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+
+  const int input_channel_stride = input_height * input_width;
+
+  const float *input_data = input->data<float>();
+  float *output_data = output->mutable_data<float>();
+
+  // Each NEON iteration reads 8 columns from two adjacent rows and writes
+  // 4 pooled values; leftover columns are handled by the scalar tail loop.
+  const int out_w_num = output_width >> 2;
+  const int remain = output_width - (out_w_num << 2);
+
+  for (int i = 0; i < batch_size; ++i) {
+    for (int c = 0; c < output_channels; ++c) {
+      for (int h = 0; h < output_height; ++h) {
+        const float *in_ptr1 = input_data + 2 * h * input_width;
+        const float *in_ptr2 = in_ptr1 + input_width;
+        int loop = out_w_num;
+        if (loop > 0) {
+          asm volatile(
+              "1:                                        \n\t"
+              "vld1.f32  {q0,q1}, [%[in_ptr1]]!          \n\t"
+              "vld1.f32  {q2,q3}, [%[in_ptr2]]!          \n\t"
+              "vmax.f32  q0, q0, q2                      \n\t"
+              "vmax.f32  q1, q1, q3                      \n\t"
+              "vpmax.f32 d4, d0, d1                      \n\t"
+              "vpmax.f32 d5, d2, d3                      \n\t"
+              "subs      %[loop], #1                     \n\t"
+              "vst1.32   {q2}, [%[out_ptr]]!             \n\t"
\n\t" + "bne max_loop \n\t" + : [in_ptr1] "+r"(input_data), + [in_ptr2] "+r"(input_data_chanel_row_next), + [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num) + : + : "memory", "q0", "q1", "q2", "q3"); + } + + for (; remain > 0; remain--) { + float max_row1 = std::max(input_data[0], input_data[1]); + float max_row2 = std::max(input_data_chanel_row_next[0], + input_data_chanel_row_next[1]); + *output_data = std::max(max_row1, max_row2); + input_data += 2; + input_data_chanel_row_next += 2; + output_data++; + } + } + input_data += input_channel_stride; + output_data += output_channel_stride; + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +#endif +} + +void Pool2x2Avg(vector strides, vector paddings, const Tensor *input, + Tensor *output) { +#if __ARM_NEON + const int batch_size = input->dims()[0]; + + const int input_height = input->dims()[2]; + + const int input_width = input->dims()[3]; + + const int output_channels = output->dims()[1]; + + int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = 2; + const int ksize_width = 2; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const int input_channel_stride = input_height * input_width; + const int output_channel_stride = output_height * output_width; + + const float *input_data = input->data(); + float *output_data = output->mutable_data(); + + int out_w_num = output_width >> 2; + const int input_batch_stride = output_channels * input_channel_stride; + const int output_batch_stride = output_channels * output_channel_stride; + float vqua[] = {0.25f, 0.25f, 0.25f, 0.25f}; + int remain = output_width - out_w_num << 2; + for (int i = 0; i < batch_size; ++i) { + for (int c = 0; c < output_channels; ++c) { + const float *input_data_chanel_row_next = input_data + input_width; + for (; output_height > 0; output_height--) { + if (out_w_num > 0) { + asm volatile( + "avg_loop: \n\t" + "vld1.32 {q0,q1}, [%[in_ptr1]]! \n\t" + "vld1.32 {q2,q3}, [%[in_ptr2]]! \n\t" + "vadd.f32 q0, q0, q2 \n\t" + "vadd.f32 q1, q1, q3 \n\t" + "vpadd.f32 d4, d0, d1 \n\t" + "vpadd.f32 d5, d2, d3 \n\t" + "vld1.32 {q4}, [%[vqua]]! \n\t" + "vmul.f32 q2, q2, q4 \n\t" + "subs %[out_w_num], #1 \n\t" + "vst1.32 {q2}, [%[out_ptr]]! \n\t" + "bne avg_loop \n\t" + : [in_ptr1] "+r"(input_data), + [in_ptr2] "+r"(input_data_chanel_row_next), + [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num) + : [vqua] "r"(vqua) + : "memory", "q0", "q1", "q2", "q3", "q4"); + } + + for (; remain > 0; remain--) { + float max_row1 = std::max(input_data[0], input_data[1]); + float max_row2 = std::max(input_data_chanel_row_next[0], + input_data_chanel_row_next[1]); + *output_data = std::max(max_row1, max_row2); + input_data += 2; + input_data_chanel_row_next += 2; + output_data++; + } + } + input_data += input_channel_stride; + output_data += output_channel_stride; + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +#endif +} + +//} +} // namespace math + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/math/pool_2x2.h b/src/operators/math/pool_2x2.h index 46e9e36470ceeee39563dc410e63a09aaec973bb..3fb0d24ba2ce854e8e63c066222e355e2c84dabb 100644 --- a/src/operators/math/pool_2x2.h +++ b/src/operators/math/pool_2x2.h @@ -16,16 +16,22 @@ limitations under the License. 
 
 #pragma once
+#include <vector>
+#include "framework/tensor.h"
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif  // __ARM_NEON
-
-static void Pool2x2Max() {
-  // todo impl with neon
-}
-
-static void Pool2x2Avg() {
-  // todo impl with neon
-}
-
-
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+using framework::Tensor;
+using std::vector;
+
+void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
+                Tensor *output);
+
+void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
+                Tensor *out);
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
 #endif
diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f404b644d78cb1b94eb96a2d587fead2575b3814
--- /dev/null
+++ b/src/operators/math/pool_3x3.cpp
@@ -0,0 +1,226 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef POOL_OP
+#include <algorithm>
+#include <climits>
+#include "pool_3x3.h"
+#include "framework/tensor.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif  // __ARM_NEON
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+using framework::Tensor;
+using std::max;
+using std::min;
+using std::vector;
+
+void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
+                Tensor *output) {
+#if __ARM_NEON
+  const int batch_size = input->dims()[0];
+  const int input_height = input->dims()[2];
+  const int input_width = input->dims()[3];
+  const int output_channels = output->dims()[1];
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  const int _kernel_size = 3;
+  const int stride_height = strides[0];
+  const int stride_width = strides[1];
+  const int padding_height = paddings[0];
+  const int padding_width = paddings[1];
+  const float negative_max = -INT_MAX;
+  const int input_channel_stride = input_height * input_width;
+  const int output_channel_stride = output_height * output_width;
+
+  const float *input_data = input->data<float>();
+  float *output_data = output->mutable_data<float>();
+
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+  const float *pos1, *pos2, *pos3;
+  float *output_ptr;
+  int hstart, wstart, hend, wend;
+  for (int i = 0; i < batch_size; ++i) {
+    for (int c = 0; c < output_channels; ++c) {
+      for (int ph = 0; ph < output_height; ph++) {
+        for (int pw = 0; pw < output_width; pw++) {
+          hstart = ph * stride_height - padding_height;
+          wstart = pw * stride_width - padding_width;
+          hend = min(hstart + _kernel_size, input_height + padding_height);
+          wend = min(wstart + _kernel_size, input_width + padding_width);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          hend = min(hend, input_height);
+          wend = min(wend, input_width);
+          pos1 = input_data + hstart * input_width + wstart;
+          pos2 = input_data + (hstart + 1) * input_width + wstart;
+          pos3 = input_data + (hstart + 2) * input_width + wstart;
+          output_ptr = output_data + ph * output_width + pw;
+
+          if (hend - hstart != 3 || wend - wstart != 3) {
+            // Windows clipped by an image border fall back to scalar code.
+            float max_value = -INT_MAX;
+            for (int h = hstart; h < hend; h++) {
+              for (int w = wstart; w < wend; w++) {
+                float value = input_data[h * input_width + w];
+                if (value > max_value) {
+                  max_value = value;
+                }
+              }
+            }
+            output_data[ph * output_width + pw] = max_value;
+          } else {
+#if defined(ARMV7)
+            asm volatile(
+                "vld1.32  {q1}, [%[pos1]]        \n\t"
+                "vld1.32  {q2}, [%[pos2]]        \n\t"
+                "vld1.32  {q3}, [%[pos3]]        \n\t"
+                "vmax.f32 q1, q1, q2             \n\t"
+                "vmax.f32 q2, q1, q3             \n\t"
+                "vmov.f32 d5[1], %[negative_max] \n\t"
+                "vpmax.f32 d6, d4, d5            \n\t"
+                "vpmax.f32 d7, d6, d6            \n\t"
+                "vst1.32  {d7[0]}, [%[output_ptr]] \n\t"
+                :
+                : [input_data] "r"(input_data), [pos1] "r"(pos1),
+                  [pos2] "r"(pos2), [pos3] "r"(pos3),
+                  [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
+                : "memory", "q1", "q2", "q3", "q4");
+#else
+            const float32x4_t data1 = vld1q_f32(pos1);
+            const float32x4_t data2 = vld1q_f32(pos2);
+            const float32x4_t data3 = vld1q_f32(pos3);
+            const float32x4_t max_data =
+                vmaxq_f32(vmaxq_f32(data1, data3), data2);
+            float32x2_t res =
+                vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
+                          vget_low_f32(max_data));
+            res = vpmax_f32(res, res);
+            output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
+#endif
+          }
+        }
+      }
+      input_data += input_channel_stride;
+      output_data += output_channel_stride;
+    }
+    input_data += input_batch_stride;
+    output_data += output_batch_stride;
+  }
+#endif
+}
+
+void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
+                Tensor *output) {
+#if __ARM_NEON
+  const int batch_size = input->dims()[0];
+  const int input_height = input->dims()[2];
+  const int input_width = input->dims()[3];
+  const int output_channels = output->dims()[1];
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  const int _kernel_size = 3;
+  const int stride_height = strides[0];
+  const int stride_width = strides[1];
+  const int padding_height = paddings[0];
+  const int padding_width = paddings[1];
+
+  const int input_channel_stride = input_height * input_width;
+  const int output_channel_stride = output_height * output_width;
+
+  const float *input_data = input->data<float>();
+  float *output_data = output->mutable_data<float>();
+  const float zero = 0;
+  const float nine = 1.0 / 9.0;
+  const float nine_ptr[] = {nine, nine};
+
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+  for (int i = 0; i < batch_size; ++i) {
+    for (int c = 0; c < output_channels; ++c) {
+      for (int ph = 0; ph < output_height; ph++) {
+        for (int pw = 0; pw < output_width; pw++) {
+          int hstart = ph * stride_height - padding_height;
+          int wstart = pw * stride_width - padding_width;
+          int hend = min(hstart + _kernel_size, input_height + padding_height);
+          int wend = min(wstart + _kernel_size, input_width + padding_width);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          hend = min(hend, input_height);
+          wend = min(wend, input_width);
+          const float *pos1 = input_data + hstart * input_width + wstart;
+          const float *pos2 = input_data + (hstart + 1) * input_width + wstart;
+          const float *pos3 = input_data + (hstart + 2) * input_width + wstart;
+          float *output_ptr = output_data + ph * output_width + pw;
+
+          if (hend - hstart != 3 || wend - wstart != 3) {
+            float sum = 0;
+            for (int h = hstart; h < hend; h++) {
+              for (int w = wstart; w < wend; w++) {
+                sum += input_data[h * input_width + w];
+              }
+            }
+            output_data[ph * output_width + pw] = sum / 9.0;
+          } else {
+#if defined(ARMV7)
+            asm volatile(
+                "vld1.32  {q1}, [%[pos1]]        \n\t"
+                "vld1.32  {q2}, [%[pos2]]        \n\t"
+                "vld1.32  {q3}, [%[pos3]]        \n\t"
+                "vadd.f32 q1, q1, q2             \n\t"
+                "vadd.f32 q2, q1, q3             \n\t"
+                "vmov.f32 d5[1], %[zero]         \n\t"
+                "vpadd.f32 d6, d4, d5            \n\t"
+                "vpadd.f32 d6, d6, d6            \n\t"
+                "vld1.f32 d7, [%[nine_ptr]]      \n\t"
+                "vmul.f32 d6, d6, d7             \n\t"
+                "vst1.32  {d6[0]}, [%[output_ptr]] \n\t"
+                :
+                : [input_data] "r"(input_data), [pos1] "r"(pos1),
+                  [pos2] "r"(pos2), [pos3] "r"(pos3),
+                  [output_ptr] "r"(output_ptr), [zero] "r"(zero),
+                  [nine_ptr] "r"(nine_ptr)
+                : "memory", "q1", "q2", "q3", "q4");
+#else
+            const float32x4_t data1 = vld1q_f32(pos1);
+            const float32x4_t data2 = vld1q_f32(pos2);
+            const float32x4_t data3 = vld1q_f32(pos3);
+            const float32x4_t sum_data =
+                vaddq_f32(vaddq_f32(data1, data3), data2);
+            float32x2_t res =
+                vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)),
+                          vget_low_f32(sum_data));
+            res = vpadd_f32(res, res);
+            output_data[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
+#endif
+          }
+        }
+      }
+      input_data += input_channel_stride;
+      output_data += output_channel_stride;
+    }
+    input_data += input_batch_stride;
+    output_data += output_batch_stride;
+  }
+#endif
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/math/pool3x3.h b/src/operators/math/pool_3x3.h
similarity index 61%
rename from src/operators/math/pool3x3.h
rename to src/operators/math/pool_3x3.h
index 164958288de5cf3bb37dcb2d37c7fe08b7bd7a1a..22a398084390701aefc8815c9aa93b82b4c4ec7b 100644
--- a/src/operators/math/pool3x3.h
+++ b/src/operators/math/pool_3x3.h
@@ -16,16 +16,22 @@ limitations under the License. */
 
 #pragma once
+#include <vector>
+#include "framework/tensor.h"
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif  // __ARM_NEON
-
-static void Pool3x3Max() {
-  // todo impl with neon
-}
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+using framework::Tensor;
+using std::vector;
 
-static void Pool3x3Avg() {
-  // todo impl with neon
-}
-
-
+void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
+                Tensor *output);
+
+void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
+                Tensor *out);
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
 #endif
diff --git a/src/operators/math/pooling.cpp b/src/operators/math/pooling.cpp
index 11bce0978f789f4f02b44fbb24fdd8bd1219257e..4287408394f1a7f407154938f3e83e9fac3543a2 100644
--- a/src/operators/math/pooling.cpp
+++ b/src/operators/math/pooling.cpp
@@ -38,9 +38,7 @@ class PoolFunctor {
     const int input_height = input.dims()[2];
 
     const int input_width = input.dims()[3];
-    if (output == nullptr) {
-      DLOG << "output tensor is null";
-    }
+
     const int output_channels = output->dims()[1];
 
     const int output_height = output->dims()[2];
diff --git a/src/operators/math/pooling.h b/src/operators/math/pooling.h
index fc6aabb5f13fdedd9dfe9877748aa4d58b3afe36..bc2ecf41d224c2b0fd518d44fecc3f688d98ee19 100644
--- a/src/operators/math/pooling.h
+++ b/src/operators/math/pooling.h
@@ -18,6 +18,8 @@ limitations under the License. */
 
 #include "common/log.h"
 #include "framework/tensor.h"
+#include "pool_2x2.h"
+#include "pool_3x3.h"
 
 namespace paddle_mobile {
 namespace operators {
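For readers less familiar with VFP/NEON assembly, here is a rough intrinsics equivalent of one iteration of the Pool2x2Max inner loop above: eight input columns from two adjacent rows are reduced to four pooled outputs (a vertical vmax followed by a horizontal pairwise vpmax). This is an illustrative sketch only, not part of the patch; the helper name pool2x2_max_step is hypothetical, and like the kernel it assumes stride 2 with no padding.

#include <arm_neon.h>

// Hypothetical helper, for illustration only: one Pool2x2Max step.
// row0/row1 point at two adjacent input rows; reads 8 floats from each
// row and writes 4 pooled maxima to out.
static inline void pool2x2_max_step(const float *row0, const float *row1,
                                    float *out) {
  float32x4_t a0 = vld1q_f32(row0);      // row0, columns 0-3
  float32x4_t a1 = vld1q_f32(row0 + 4);  // row0, columns 4-7
  float32x4_t b0 = vld1q_f32(row1);      // row1, columns 0-3
  float32x4_t b1 = vld1q_f32(row1 + 4);  // row1, columns 4-7
  float32x4_t m0 = vmaxq_f32(a0, b0);    // vertical max (the vmax.f32 step)
  float32x4_t m1 = vmaxq_f32(a1, b1);
  // Pairwise horizontal max (the vpmax.f32 step): 8 column maxima -> 4 outputs.
  float32x2_t p0 = vpmax_f32(vget_low_f32(m0), vget_high_f32(m0));
  float32x2_t p1 = vpmax_f32(vget_low_f32(m1), vget_high_f32(m1));
  vst1q_f32(out, vcombine_f32(p0, p1));
}

Output widths that are not a multiple of four leave a remainder of columns; the patch handles those with the scalar tail loop rather than widening the vector path.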