From ad5087c9df2377d619a492152a4adc3f83c6afb5 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Mon, 3 Dec 2018 00:21:58 +0800
Subject: [PATCH] Optimize int8 depthwise conv

---
 src/operators/kernel/arm/quantize_kernel.cpp  |   4 +-
 .../kernel/central-arm-func/conv_arm_func.h   |   4 +-
 src/operators/math/depthwise_conv3x3.h        |   4 +-
 src/operators/math/depthwise_conv3x3_int8.cpp | 530 +++++++++++++++---
 4 files changed, 447 insertions(+), 95 deletions(-)

diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp
index 4c6e6452c2..ca3fa71f98 100644
--- a/src/operators/kernel/arm/quantize_kernel.cpp
+++ b/src/operators/kernel/arm/quantize_kernel.cpp
@@ -88,8 +88,8 @@ template <>
 inline int8_t Round<ROUND_NEAREST_TO_EVEN>(const float &x) {
   float v = std::round(x);
   int32_t q = static_cast<int32_t>(v);
-  if (abs(abs(q - v) - 0.5) <= 0) {
-    if (abs(q) % 2 != 0) {
+  if (std::abs(std::abs(q - v) - 0.5) <= 0) {
+    if (std::abs(q) % 2 != 0) {
       q = q + ((q > 0) ? -1 : 1);
     }
   }
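Annotation (not part of the patch): the hunk above only qualifies abs() with std::; the surrounding kernel implements round-half-to-even quantization. A minimal standalone sketch of that policy, with illustrative names and the halfway test written against the unrounded input x (the quantity that can actually sit on a .5 boundary):

    #include <cmath>
    #include <cstdint>

    // Round to nearest; exact .5 ties go to the even neighbor.
    inline int8_t RoundHalfToEven(float x) {
      float v = std::round(x);              // halfway cases round away from zero
      int32_t q = static_cast<int32_t>(v);
      if (std::abs(std::abs(static_cast<float>(q) - x) - 0.5f) < 1e-6f) {
        if (std::abs(q) % 2 != 0) {
          q += (q > 0) ? -1 : 1;            // pull an odd result back to even
        }
      }
      return static_cast<int8_t>(q);
    }
    // RoundHalfToEven(2.5f) == 2, RoundHalfToEven(3.5f) == 4,
    // RoundHalfToEven(-2.5f) == -2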
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h
index 00cb4dfb04..8a88e0e10b 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -180,10 +180,10 @@ inline void DepthwiseConv3x3(const ConvParam<CPU> &param) {
     Tensor in_batch = input->Slice(i, i + 1);
     Tensor out_batch = output->Slice(i, i + 1);
     if (strides[0] == 1) {
-      math::DepthwiseConv3x3s1<Itype, Otype>(in_batch, *filter, paddings,
+      math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings,
                                              &out_batch);
     } else if (strides[0] == 2) {
-      math::DepthwiseConv3x3s2<Itype, Otype>(in_batch, *filter, paddings,
+      math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings,
                                              &out_batch);
     } else {
       // math::DepthwiseConv3x3(input_pad, *filter,
diff --git a/src/operators/math/depthwise_conv3x3.h b/src/operators/math/depthwise_conv3x3.h
index 35d6c7d3f0..ca8f45fa51 100644
--- a/src/operators/math/depthwise_conv3x3.h
+++ b/src/operators/math/depthwise_conv3x3.h
@@ -74,13 +74,13 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
 //                          framework::Tensor *output);

 template <typename Itype, typename Otype>
-void DepthwiseConv3x3s1(const framework::Tensor &input,
+void DepthwiseConv3x3S1(const framework::Tensor &input,
                         const framework::Tensor &filter,
                         const std::vector<int> &paddings,
                         framework::Tensor *output);

 template <typename Itype, typename Otype>
-void DepthwiseConv3x3s2(const framework::Tensor &input,
+void DepthwiseConv3x3S2(const framework::Tensor &input,
                         const framework::Tensor &filter,
                         const std::vector<int> &paddings,
                         framework::Tensor *output);
diff --git a/src/operators/math/depthwise_conv3x3_int8.cpp b/src/operators/math/depthwise_conv3x3_int8.cpp
index 38081ea6bb..9b4c6096ec 100644
--- a/src/operators/math/depthwise_conv3x3_int8.cpp
+++ b/src/operators/math/depthwise_conv3x3_int8.cpp
@@ -12,12 +12,300 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#if defined(__ARM_NEON__) && !defined(__aarch64__)
+
 #include "operators/math/depthwise_conv3x3.h"
+#ifdef __ARM_NEON__
+#include <arm_neon.h>
+#endif

 namespace paddle_mobile {
 namespace operators {
 namespace math {

+template <int Stride>
+inline void Depth3x3ValidColLoadInput(const int8_t *input, const int input_w,
+                                      const int valid_cols, int16x8_t *y0,
+                                      int16x8_t *y1, int16x8_t *y2) {
+  PADDLE_MOBILE_THROW_EXCEPTION("Stride %d is not supported.", Stride);
+}
+
+template <>
+inline void Depth3x3ValidColLoadInput<1>(const int8_t *input,
+                                         const int input_w,
+                                         const int valid_cols, int16x8_t *y0,
+                                         int16x8_t *y1, int16x8_t *y2) {
+  int8_t fake_input[3][8];
+  if (valid_cols == 1) {
+    for (int i = 0; i < 8; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+    }
+  } else if (valid_cols == 2) {
+    for (int i = 0; i < 8; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+      fake_input[1][i] = input[1];
+    }
+  } else {
+    for (int i = 0; i < 8; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+      fake_input[1][i] = input[1];
+      fake_input[2][i] = input[2];
+    }
+  }
+  int8x8_t input0 = vld1_s8(fake_input[0]);
+  int8x8_t input1 = vld1_s8(fake_input[1]);
+  int8x8_t input2 = vld1_s8(fake_input[2]);
+  y0[0] = vmovl_s8(input0);
+  y1[0] = vmovl_s8(input1);
+  y2[0] = vmovl_s8(input2);
+  y0[1] = vextq_s16(y0[0], y0[0], 1);
+  y0[2] = vextq_s16(y0[0], y0[0], 2);
+  y1[1] = vextq_s16(y1[0], y1[0], 1);
+  y1[2] = vextq_s16(y1[0], y1[0], 2);
+  y2[1] = vextq_s16(y2[0], y2[0], 1);
+  y2[2] = vextq_s16(y2[0], y2[0], 2);
+}
+
+template <>
+inline void Depth3x3ValidColLoadInput<2>(const int8_t *input,
+                                         const int input_w,
+                                         const int valid_cols, int16x8_t *y0,
+                                         int16x8_t *y1, int16x8_t *y2) {
+  int8_t fake_input[3][13];
+  if (valid_cols == 1) {
+    for (int i = 0; i < 13; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+    }
+  } else if (valid_cols == 2) {
+    for (int i = 0; i < 13; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+      fake_input[1][i] = input[1];
+    }
+  } else {
+    for (int i = 0; i < 13; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+      fake_input[1][i] = input[1];
+      fake_input[2][i] = input[2];
+    }
+  }
+  int8x8x2_t input0 = vld2_s8(fake_input[0]);
+  int8x8x2_t input1 = vld2_s8(fake_input[1]);
+  int8x8x2_t input2 = vld2_s8(fake_input[2]);
+  y0[0] = vmovl_s8(input0.val[0]);
+  y0[1] = vmovl_s8(input0.val[1]);
+  y0[2] = vextq_s16(y0[0], y0[0], 1);
+  y1[0] = vmovl_s8(input1.val[0]);
+  y1[1] = vmovl_s8(input1.val[1]);
+  y1[2] = vextq_s16(y1[0], y1[0], 1);
+  y2[0] = vmovl_s8(input2.val[0]);
+  y2[1] = vmovl_s8(input2.val[1]);
+  y2[2] = vextq_s16(y2[0], y2[0], 1);
+}
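Annotation (not part of the patch): the two loaders above widen the int8 pixels to int16 once and then derive the extra shifted views a 3-tap window needs with vextq_s16 instead of reloading from memory. The same pattern in isolation, with a hypothetical helper name:

    #include <arm_neon.h>

    // w[1] and w[2] are w[0] advanced by one and two elements -- the three
    // taps of a 3-wide window at stride 1. Only the lanes the caller
    // consumes matter; the rotated tail lanes are ignored downstream.
    static inline void SlidingWindow3(const int8_t *p, int16x8_t w[3]) {
      int16x8_t base = vmovl_s8(vld1_s8(p));  // widen x0..x7 to 16 bits
      w[0] = base;                            // x0 x1 x2 x3 x4 x5 x6 x7
      w[1] = vextq_s16(base, base, 1);        // x1 x2 x3 x4 x5 x6 x7 x0
      w[2] = vextq_s16(base, base, 2);        // x2 x3 x4 x5 x6 x7 x0 x1
    }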
+
+template <int Stride_h, int Stride_w>
+inline void DepthwiseConv3x3ValidCol(const int8_t *input, const int8_t *filter,
+                                     const int h_output,
+                                     const int h_output_end,
+                                     const int w_output, const int input_h,
+                                     const int input_w, const int padding_h,
+                                     const int padding_w, const int output_w,
+                                     int32_t *output) {
+  const int w_in_start = -padding_w + w_output * Stride_w;
+  const int w_in_end = w_in_start + 3;
+  const int w_start = w_in_start > 0 ? w_in_start : 0;
+  const int w_end = w_in_end < input_w ? w_in_end : input_w;
+  int remain_start = h_output;
+
+#ifdef __ARM_NEON__
+  int output_tiles = (h_output_end - h_output) / 6;
+  remain_start = h_output + output_tiles * 6;
+  int input_h_start = h_output * Stride_h - padding_h;
+  size_t input_offset = input_h_start * input_w + w_start;
+  size_t output_offset = h_output * output_w + w_output;
+  int16x8_t _input[3][3];
+  int16x4_t _kernel[3];
+  int32x4_t _sum0, _sum1;
+  const int8_t *filter_ptr = filter;
+  asm volatile(
+      "mov        r0, #3                \n"
+      "vld1.s8    d10, [%[filter]], r0  \n"
+      "vld1.s8    d11, [%[filter]], r0  \n"
+      "vld1.s8    d12, [%[filter]]      \n"
+      "vtrn.8     d10, d11              \n"
+      "vtrn.8     d12, d13              \n"
+      "vtrn.16    d10, d12              \n"
+      "vtrn.16    d11, d13              \n"
+      "vmovl.s8   q7, d10               \n"
+      "vmovl.s8   q8, d11               \n"
+      "vmovl.s8   q9, d12               \n"
+      "vmov.32    %[_kernel0], d14      \n"
+      "vmov.32    %[_kernel1], d16      \n"
+      "vmov.32    %[_kernel2], d18      \n"
+      : [_kernel0] "+w"(_kernel[0]), [_kernel1] "+w"(_kernel[1]),
+        [_kernel2] "+w"(_kernel[2])
+      : [filter] "r"(filter_ptr)
+      : "memory", "q5", "q6", "q7", "q8", "q9", "r0");
+  int valid_cols = w_end - w_start;
+  for (int h = 0; h < output_tiles * 6; h += 6) {
+    int32_t *output0 = output + output_offset;
+    int32_t *output1 = output0 + output_w;
+    int32_t *output2 = output1 + output_w;
+    int32_t *output3 = output2 + output_w;
+    int32_t *output4 = output3 + output_w;
+    int32_t *output5 = output4 + output_w;
+    Depth3x3ValidColLoadInput<Stride_h>(input + input_offset, input_w,
+                                        valid_cols, _input[0], _input[1],
+                                        _input[2]);
+    _sum0 = veorq_s32(_sum0, _sum0);
+    _sum1 = veorq_s32(_sum1, _sum1);
+    for (int w_in = 0; w_in < valid_cols; ++w_in) {
+      int index = w_in + w_start - w_in_start;
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_input[w_in][0]),
+                             _kernel[index], 0);
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_input[w_in][1]),
+                             _kernel[index], 1);
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_input[w_in][2]),
+                             _kernel[index], 2);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_input[w_in][0]),
+                             _kernel[index], 0);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_input[w_in][1]),
+                             _kernel[index], 1);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_input[w_in][2]),
+                             _kernel[index], 2);
+    }
+    vst1q_lane_s32(output0, _sum0, 0);
+    vst1q_lane_s32(output1, _sum0, 1);
+    vst1q_lane_s32(output2, _sum0, 2);
+    vst1q_lane_s32(output3, _sum0, 3);
+    vst1q_lane_s32(output4, _sum1, 0);
+    vst1q_lane_s32(output5, _sum1, 1);
+    input_offset += 6 * Stride_h * input_w;
+    output_offset += 6 * output_w;
+  }
+#endif
+  for (int h = remain_start; h < h_output_end; ++h) {
+    int32_t value = 0;
+    const int h_in_start = -padding_h + h * Stride_h;
+    for (int i = 0; i < 3; ++i) {
+      for (int w_in = w_start; w_in < w_end; ++w_in) {
+        value += filter[i * 3 + (w_in - w_in_start)] *
+                 input[(h_in_start + i) * input_w + w_in];
+      }
+    }
+    output[h * output_w + w_output] = value;
+  }
+}
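Annotation (not part of the patch): the inline assembly above loads the 3x3 filter row by row and transposes it with vtrn.8/vtrn.16, so each _kernel[c] ends up holding filter column c widened to int16; a whole input column can then be multiplied by a single int16x4 of weights via vmlal_lane_s16. The resulting layout, written out with a hypothetical scalar helper:

    #include <arm_neon.h>

    // k[c] lane r == filter(row r, column c), i.e. the kernel transposed.
    static inline void TransposeKernel3x3(const int8_t *filter,
                                          int16x4_t k[3]) {
      int16_t col[3][4] = {{0}};
      for (int c = 0; c < 3; ++c) {
        for (int r = 0; r < 3; ++r) {
          col[c][r] = filter[r * 3 + c];
        }
        k[c] = vld1_s16(col[c]);
      }
    }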
+
+#define DEPTHWISE_CONV_NORMAL_BORDER(start, end)                         \
+  for (int w = start; w < end; ++w) {                                    \
+    const int w_in_start = -padding_w + w * Stride_w;                    \
+    const int w_in_end = w_in_start + 3;                                 \
+    const int w_start = w_in_start > 0 ? w_in_start : 0;                 \
+    const int w_end = w_in_end < input_w ? w_in_end : input_w;           \
+    int32_t value = 0;                                                   \
+    for (int h_in = h_start; h_in < h_end; ++h_in) {                     \
+      for (int w_in = w_start; w_in < w_end; ++w_in) {                   \
+        value += filter[(h_in - h_in_start) * 3 + (w_in - w_in_start)] * \
+                 input[h_in * input_w + w_in];                           \
+      }                                                                  \
+    }                                                                    \
+    output_ptr[w] = value;                                               \
+  }
+
+template <int Stride>
+inline void Depth3x3NormalRowLoadInput(const int8_t *input,
+                                       int16x8_t &y0,    // NOLINT
+                                       int16x8_t &y1,    // NOLINT
+                                       int16x8_t &y2) {  // NOLINT
+  PADDLE_MOBILE_THROW_EXCEPTION("Stride %d is not supported.", Stride);
+}
+
+template <>
+inline void Depth3x3NormalRowLoadInput<1>(const int8_t *input,
+                                          int16x8_t &y0,    // NOLINT
+                                          int16x8_t &y1,    // NOLINT
+                                          int16x8_t &y2) {  // NOLINT
+  int8x8_t x0 = vld1_s8(input);
+  y0 = vmovl_s8(x0);
+  y1 = vextq_s16(y0, y0, 1);
+  y2 = vextq_s16(y1, y1, 1);
+}
+
+template <>
+inline void Depth3x3NormalRowLoadInput<2>(const int8_t *input,
+                                          int16x8_t &y0,    // NOLINT
+                                          int16x8_t &y1,    // NOLINT
+                                          int16x8_t &y2) {  // NOLINT
+  int8x8x2_t x0 = vld2_s8(input);
+  y0 = vmovl_s8(x0.val[0]);
+  y1 = vmovl_s8(x0.val[1]);
+  y2 = vextq_s16(y0, y0, 1);
+}
+
+template <int Stride_h, int Stride_w>
+inline void DepthwiseConv3x3NormalRow(const int8_t *input,
+                                      const int8_t *filter, const int h_output,
+                                      const int input_h, const int input_w,
+                                      const int padding_h, const int padding_w,
+                                      const int output_w, int32_t *output) {
+  const int h_in_start = -padding_h + h_output * Stride_h;
+  const int h_in_end = h_in_start + 3;
+  const int h_start = h_in_start > 0 ? h_in_start : 0;
+  const int h_end = h_in_end < input_h ? h_in_end : input_h;
+
+  int valid_w_start = (padding_w + Stride_w - 1) / Stride_w;
+  int valid_w_end = output_w - valid_w_start;
+
+  int32_t *output_ptr = output + h_output * output_w;
+  // border left
+  DEPTHWISE_CONV_NORMAL_BORDER(0, valid_w_start)
+  // middle
+  int remain_start = valid_w_start;
+#ifdef __ARM_NEON__
+  int output_tiles = (valid_w_end - valid_w_start) / 6;
+  remain_start = valid_w_start + output_tiles * 6;
+  int32x4_t _sum0, _sum1;
+  int16x8_t y0, y1, y2;
+  int16x4_t _kernel[3];
+  for (int h_in = h_start; h_in < h_end; ++h_in) {
+    int index = h_in - h_in_start;
+    int8x8_t w0 = vld1_s8(filter + index * 3);
+    int16x8_t w1 = vmovl_s8(w0);
+    _kernel[index] = vget_low_s16(w1);
+  }
+  for (int w = 0; w < output_tiles * 6; w += 6) {
+    _sum0 = veorq_s32(_sum0, _sum0);
+    _sum1 = veorq_s32(_sum1, _sum1);
+    int output_offset = valid_w_start + w;
+    int input_w_offset = output_offset * Stride_w - padding_w;
+    for (int h_in = h_start; h_in < h_end; ++h_in) {
+      int index = h_in - h_in_start;
+      Depth3x3NormalRowLoadInput<Stride_w>(
+          input + h_in * input_w + input_w_offset, y0, y1, y2);
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(y0), _kernel[index], 0);
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(y1), _kernel[index], 1);
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(y2), _kernel[index], 2);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(y0), _kernel[index], 0);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(y1), _kernel[index], 1);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(y2), _kernel[index], 2);
+    }
+    vst1q_s32(output_ptr + output_offset, _sum0);
+    vst1q_lane_s32(output_ptr + output_offset + 4, _sum1, 0);
+    vst1q_lane_s32(output_ptr + output_offset + 5, _sum1, 1);
+  }
+#endif
+  for (int w = remain_start; w < valid_w_end; ++w) {
+    int32_t value = 0;
+    int input_start = -padding_w + w * Stride_w;
+    for (int h_in = h_start; h_in < h_end; ++h_in) {
+      for (int j = 0; j < 3; ++j) {
+        value += filter[(h_in - h_in_start) * 3 + j] *
+                 input[h_in * input_w + j + input_start];
+      }
+    }
+    output_ptr[w] = value;
+  }
+  // border right
+  DEPTHWISE_CONV_NORMAL_BORDER(valid_w_end, output_w)
+}
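Annotation (not part of the patch): Depth3x3NormalRowLoadInput<2> above relies on vld2_s8 deinterleaving sixteen bytes into even and odd pixels, which is exactly the access pattern of a stride-2 window; one vext then yields the third tap. A self-contained sketch of the idea with an illustrative name:

    #include <arm_neon.h>

    static inline void LoadStride2Taps(const int8_t *row, int16x8_t taps[3]) {
      int8x8x2_t v = vld2_s8(row);   // val[0] = x0,x2,x4,... val[1] = x1,x3,x5,...
      taps[0] = vmovl_s8(v.val[0]);  // tap 0: x0, x2, x4, ...
      taps[1] = vmovl_s8(v.val[1]);  // tap 1: x1, x3, x5, ...
      taps[2] = vextq_s16(taps[0], taps[0], 1);  // tap 2: x2, x4, x6, ...
    }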
+
 // template<>
 // void DepthwiseConv3x3(
 //     const framework::Tensor *input, const framework::Tensor *filter,
@@ -27,44 +315,72 @@ namespace math {
 // }

 template <>
-void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
+void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
                                          const framework::Tensor &filter,
                                          const std::vector<int> &paddings,
                                          framework::Tensor *output) {
   const int8_t *input_data = input.data<int8_t>();
   const int8_t *filter_data = filter.data<int8_t>();
   int32_t *out_data = output->mutable_data<int32_t>();
-  // make sure that batch size is 1
-  int input_c = input.dims()[1];
   int input_h = input.dims()[2];
   int input_w = input.dims()[3];
-  int output_c = output->dims()[1];
   int output_h = output->dims()[2];
   int output_w = output->dims()[3];
+  int padding_h = paddings[0];
+  int padding_w = paddings[1];
   int image_size = input_h * input_w;
   int out_image_size = output_h * output_w;
-#if __aarch64__
-  // TODO(hjchen2)
-#else
+  int valid_h_start = padding_h;
+  int valid_h_end = output_h - valid_h_start;
+  int valid_h = valid_h_end - valid_h_start;
+  int valid_w_start = padding_w;
+  int valid_w_end = output_w - valid_w_start;
+  int valid_w = valid_w_end - valid_w_start;
+
 #pragma omp parallel for
-  for (int g = 0; g < input_c; ++g) {
-    const int8_t* input_ptr = input_data + g * image_size;
-    const int8_t* filter_ptr = filter_data + g * 9;
-    int32_t* output_ptr = out_data + g * out_image_size;
-    int loops = (input_w - 2) / 6;
-    int remain = input_w - 2 - loops * 6;
-    for (int h = 0; h < input_h - 5 /*(input_h - 2) - 3*/; h += 4) {
-      const int8_t* input_ptr0 = input_ptr + h * input_w;
-      const int8_t* input_ptr1 = input_ptr0 + input_w;
-      const int8_t* input_ptr2 = input_ptr1 + input_w;
-      const int8_t* input_ptr3 = input_ptr2 + input_w;
-      const int8_t* input_ptr4 = input_ptr3 + input_w;
-      const int8_t* input_ptr5 = input_ptr4 + input_w;
-      int32_t* output_ptr0 = output_ptr + h * output_w;
-      int32_t* output_ptr1 = output_ptr0 + output_w;
-      int32_t* output_ptr2 = output_ptr1 + output_w;
-      int32_t* output_ptr3 = output_ptr2 + output_w;
-      int loop = loops;
+  for (int g = 0; g < input.dims()[1]; ++g) {
+    const int8_t *input_ptr = input_data + g * image_size;
+    const int8_t *filter_ptr = filter_data + g * 9;
+    int32_t *output_ptr = out_data + g * out_image_size;
+    // top
+    for (int h = 0; h < valid_h_start; ++h) {
+      DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h,
+                                      input_w, padding_h, padding_w, output_w,
+                                      output_ptr);
+    }
+    // left
+    for (int w = 0; w < valid_w_start; ++w) {
+      DepthwiseConv3x3ValidCol<1, 1>(
+          input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h,
+          input_w, padding_h, padding_w, output_w, output_ptr);
+    }
+    // right
+    for (int w = valid_w_end; w < output_w; ++w) {
+      DepthwiseConv3x3ValidCol<1, 1>(
+          input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h,
+          input_w, padding_h, padding_w, output_w, output_ptr);
+    }
+    // bottom
+    for (int h = valid_h_end; h < output_h; ++h) {
+      DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h,
+                                      input_w, padding_h, padding_w, output_w,
+                                      output_ptr);
+    }
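Annotation (not part of the patch): the four loops above plus the "valid" loops that follow implement a five-region decomposition of each output channel; only outputs whose 3x3 receptive field lies fully inside the input take the NEON tile path, so the hot loops need no bounds checks:

    //   +------------- top: DepthwiseConv3x3NormalRow -------------+
    //   | left:      |            valid:             | right:      |
    //   | ValidCol   |  NEON 4x6 / 2x6 / 1x6 tiles   | ValidCol    |
    //   +----------- bottom: DepthwiseConv3x3NormalRow ------------+
    //
    // With padding 1 and stride 1, row 0, the last row, column 0 and the
    // last column fall back to the helpers; everything else vectorizes.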
+    // valid
+    int output_w_tiles = valid_w / 6;
+    int output_w_remain = valid_w - output_w_tiles * 6;
+    for (int h = valid_h_start; h < valid_h_end - 3; h += 4) {
+      const int8_t *input_ptr0 = input_ptr + (h - padding_h) * input_w;
+      const int8_t *input_ptr1 = input_ptr0 + input_w;
+      const int8_t *input_ptr2 = input_ptr1 + input_w;
+      const int8_t *input_ptr3 = input_ptr2 + input_w;
+      const int8_t *input_ptr4 = input_ptr3 + input_w;
+      const int8_t *input_ptr5 = input_ptr4 + input_w;
+      int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start;
+      int32_t *output_ptr1 = output_ptr0 + output_w;
+      int32_t *output_ptr2 = output_ptr1 + output_w;
+      int32_t *output_ptr3 = output_ptr2 + output_w;
+      int loop = output_w_tiles;
       asm volatile(
           "vld1.32 {q0}, [%[filter_ptr]] \n"
           "vmovl.s8 q14, d0 \n"
@@ -378,27 +694,27 @@ void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
           "vst1.32 {d24[0]}, [%[output_ptr1]]! \n"
           "vst1.32 {d28[0]}, [%[output_ptr2]]! \n"
           "vst1.32 {d10[0]}, [%[output_ptr3]]! \n"
-          "end_%=: \n"
+          "end_%=:                             \n"
           : [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1),
             [output_ptr2] "+r"(output_ptr2), [output_ptr3] "+r"(output_ptr3),
             [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1),
             [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3),
             [input_ptr4] "+r"(input_ptr4), [input_ptr5] "+r"(input_ptr5),
             [loop] "+r"(loop)
-          : [remain] "r"(remain)
+          : [remain] "r"(output_w_remain)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
             "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0");
     }
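Annotation (not part of the patch): the tile shapes are chosen so a single 8-byte NEON load covers one tile row: six stride-1 outputs need 6 + 3 - 1 = 8 input bytes (one d register), and four output rows need 4 + 3 - 1 = 6 input rows, hence input_ptr0..input_ptr5 and the h += 4 step. The loops below handle the leftover rows with 2-high and 1-high variants of the same inner kernel.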
     // remain height
-    int start_h = (input_h - 2) & 0xFFFC;
-    for (int h = start_h; h < input_h - 3 /*(input_h - 2) - 1*/; h += 2) {
-      const int8_t* input_ptr0 = input_ptr + h * input_w;
-      const int8_t* input_ptr1 = input_ptr0 + input_w;
-      const int8_t* input_ptr2 = input_ptr1 + input_w;
-      const int8_t* input_ptr3 = input_ptr2 + input_w;
-      int32_t* output_ptr0 = output_ptr + h * output_w;
-      int32_t* output_ptr1 = output_ptr0 + output_w;
-      int loop = loops;
+    int start_h = valid_h_start + (valid_h & 0xFFFC);
+    for (int h = start_h; h < valid_h_end - 1; h += 2) {
+      const int8_t *input_ptr0 = input_ptr + (h - padding_h) * input_w;
+      const int8_t *input_ptr1 = input_ptr0 + input_w;
+      const int8_t *input_ptr2 = input_ptr1 + input_w;
+      const int8_t *input_ptr3 = input_ptr2 + input_w;
+      int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start;
+      int32_t *output_ptr1 = output_ptr0 + output_w;
+      int loop = output_w_tiles;
       asm volatile(
           "vld1.32 {q0}, [%[filter_ptr]] \n"
           "vmovl.s8 q14, d0 \n"
@@ -416,9 +732,9 @@ void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
           : [filter_ptr] "r"(filter_ptr)
          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15");
       asm volatile(
-          "mov r0, #6 \n"
           "cmp %[loop], #0 \n"
           "ble start_remain_%= \n"
+          "mov r0, #6 \n"
           // loop 6 widths
           "loop_2h6w_%=: \n"
           "vld1.32 {d9}, [%[input_ptr0]], r0 \n"
@@ -590,23 +906,23 @@ void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
           "blt end_%= \n"
           "vst1.32 {d20[0]}, [%[output_ptr0]]! \n"
           "vst1.32 {d24[0]}, [%[output_ptr1]]! \n"
-          "end_%=: \n"
+          "end_%=:                             \n"
           : [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1),
             [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1),
             [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3),
             [loop] "+r"(loop)
-          : [remain] "r"(remain)
+          : [remain] "r"(output_w_remain)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
             "q8", "q9", "q10", "q11", "q12", "q13", "r0");
     }
-    start_h = (input_h - 2) & 0xFFFE;
-    if (start_h < input_h - 2) {
-      const int8_t* input_ptr0 = input_ptr + start_h * input_w;
-      const int8_t* input_ptr1 = input_ptr0 + input_w;
-      const int8_t* input_ptr2 = input_ptr1 + input_w;
-      int32_t* output_ptr0 = output_ptr + start_h * output_w;
-      int loop = loops;
+    start_h = valid_h_start + (valid_h & 0xFFFE);
+    if (start_h < valid_h_end) {
+      const int8_t *input_ptr0 = input_ptr + (start_h - padding_h) * input_w;
+      const int8_t *input_ptr1 = input_ptr0 + input_w;
+      const int8_t *input_ptr2 = input_ptr1 + input_w;
+      int32_t *output_ptr0 = output_ptr + start_h * output_w + valid_w_start;
+      int loop = output_w_tiles;
       asm volatile(
           "vld1.32 {q0}, [%[filter_ptr]] \n"
           "vmovl.s8 q14, d0 \n"
@@ -624,9 +940,9 @@ void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
           : [filter_ptr] "r"(filter_ptr)
           : "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15");
       asm volatile(
-          "mov r0, #6 \n"
           "cmp %[loop], #0 \n"
           "ble start_remain_%= \n"
+          "mov r0, #6 \n"
           // loop 6 widths
           "loop_1h6w_%=: \n"
           "vld1.32 {d9}, [%[input_ptr0]], r0 \n"
@@ -737,57 +1053,91 @@ void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
           "cmp %[remain], #1 \n"
           "blt end_%= \n"
           "vst1.32 {d20[0]}, [%[output_ptr0]]! \n"
-          "end_%=: \n"
+          "end_%=:                             \n"
           : [output_ptr0] "+r"(output_ptr0), [input_ptr0] "+r"(input_ptr0),
             [input_ptr1] "+r"(input_ptr1), [input_ptr2] "+r"(input_ptr2),
             [loop] "+r"(loop)
-          : [remain] "r"(remain)
+          : [remain] "r"(output_w_remain)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
             "q8", "q9", "q10", "q11", "r0");
     }
   }
-#endif  // __aarch64__
 }
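Annotation (not part of the patch): a scalar reference for one channel, useful as a test oracle for both the stride-1 kernel above and the stride-2 kernel below; borders included, zero padding, int32 accumulation as in the NEON paths (hypothetical helper name):

    #include <cstdint>

    void DepthwiseConv3x3RefChannel(const int8_t *in, const int8_t *w9,
                                    int input_h, int input_w, int padding_h,
                                    int padding_w, int stride, int output_h,
                                    int output_w, int32_t *out) {
      for (int oh = 0; oh < output_h; ++oh) {
        for (int ow = 0; ow < output_w; ++ow) {
          int32_t acc = 0;
          for (int kh = 0; kh < 3; ++kh) {
            for (int kw = 0; kw < 3; ++kw) {
              int ih = oh * stride - padding_h + kh;
              int iw = ow * stride - padding_w + kw;
              if (ih >= 0 && ih < input_h && iw >= 0 && iw < input_w) {
                acc += static_cast<int32_t>(w9[kh * 3 + kw]) *
                       in[ih * input_w + iw];
              }
            }
          }
          out[oh * output_w + ow] = acc;
        }
      }
    }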

 template <>
-void DepthwiseConv3x3s2<int8_t, int32_t>(const framework::Tensor &input,
+void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
                                          const framework::Tensor &filter,
                                          const std::vector<int> &paddings,
                                          framework::Tensor *output) {
   const int8_t *input_data = input.data<int8_t>();
   const int8_t *filter_data = filter.data<int8_t>();
   int32_t *out_data = output->mutable_data<int32_t>();
-  // make sure that batch size is 1
-  int input_c = input.dims()[1];
   int input_h = input.dims()[2];
   int input_w = input.dims()[3];
-  int output_c = output->dims()[1];
   int output_h = output->dims()[2];
   int output_w = output->dims()[3];
+  int padding_h = paddings[0];
+  int padding_w = paddings[1];
   int image_size = input_h * input_w;
   int out_image_size = output_h * output_w;
-#if __aarch64__
-  // TODO(hjchen2)
-#else
+  int valid_h_start = (padding_h + 1) / 2;
+  int valid_h_end = output_h - valid_h_start;
+  int valid_h = valid_h_end - valid_h_start;
+  int valid_w_start = (padding_w + 1) / 2;
+  int valid_w_end = output_w - valid_w_start;
+  int valid_w = valid_w_end - valid_w_start;
+
+  // DLOG << "valid_h_start: " << valid_h_start;
+  // DLOG << "valid_h_end: " << valid_h_end;
+  // DLOG << "valid_w_start: " << valid_w_start;
+  // DLOG << "valid_w_end: " << valid_w_end;
+
 #pragma omp parallel for
-  for (int g = 0; g < input_c; ++g) {
-    const int8_t* input_ptr = input_data + g * image_size;
-    const int8_t* filter_ptr = filter_data + g * 9;
-    int32_t* output_ptr = out_data + g * out_image_size;
-    int loops = output_w / 6;
-    int remain = output_w - loops * 6;
-    for (int h = 0; h < input_h - 6 /*(input_h - 1) - 5*/; h += 6) {
-      const int8_t* input_ptr0 = input_ptr + h * input_w;
-      const int8_t* input_ptr1 = input_ptr0 + input_w;
-      const int8_t* input_ptr2 = input_ptr1 + input_w;
-      const int8_t* input_ptr3 = input_ptr2 + input_w;
-      const int8_t* input_ptr4 = input_ptr3 + input_w;
-      const int8_t* input_ptr5 = input_ptr4 + input_w;
-      const int8_t* input_ptr6 = input_ptr5 + input_w;
-      int32_t* output_ptr0 = output_ptr + (h >> 1) * output_w;
-      int32_t* output_ptr1 = output_ptr0 + output_w;
-      int32_t* output_ptr2 = output_ptr1 + output_w;
-      int loop = loops;
+  for (int g = 0; g < input.dims()[1]; ++g) {
+    const int8_t *input_ptr = input_data + g * image_size;
+    const int8_t *filter_ptr = filter_data + g * 9;
+    int32_t *output_ptr = out_data + g * out_image_size;
+    // top
+    for (int h = 0; h < valid_h_start; ++h) {
+      DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h,
+                                      input_w, padding_h, padding_w, output_w,
+                                      output_ptr);
+    }
+    // left
+    for (int w = 0; w < valid_w_start; ++w) {
+      DepthwiseConv3x3ValidCol<2, 2>(
+          input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h,
+          input_w, padding_h, padding_w, output_w, output_ptr);
+    }
+    // right
+    for (int w = valid_w_end; w < output_w; ++w) {
+      DepthwiseConv3x3ValidCol<2, 2>(
+          input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h,
+          input_w, padding_h, padding_w, output_w, output_ptr);
+    }
+    // bottom
+    for (int h = valid_h_end; h < output_h; ++h) {
+      DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h,
+                                      input_w, padding_h, padding_w, output_w,
+                                      output_ptr);
+    }
+    // valid
+    int input_w_start = 2 * valid_w_start - padding_w;
+    int output_w_tiles = valid_w / 6;
+    int output_w_remain = valid_w - output_w_tiles * 6;
+    for (int h = valid_h_start; h < valid_h_end - 2; h += 3) {
+      size_t offset = (2 * h - padding_h) * input_w + input_w_start;
+      const int8_t *input_ptr0 = input_ptr + offset;
+      const int8_t *input_ptr1 = input_ptr0 + input_w;
+      const int8_t *input_ptr2 = input_ptr1 + input_w;
+      const int8_t *input_ptr3 = input_ptr2 + input_w;
+      const int8_t *input_ptr4 = input_ptr3 + input_w;
+      const int8_t *input_ptr5 = input_ptr4 + input_w;
+      const int8_t *input_ptr6 = input_ptr5 + input_w;
+      int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start;
+      int32_t *output_ptr1 = output_ptr0 + output_w;
+      int32_t *output_ptr2 = output_ptr1 + output_w;
+      int loop = output_w_tiles;
       asm volatile(
           "vld1.32 {q0}, [%[filter_ptr]] \n"
           "vmovl.s8 q14, d0 \n"
@@ -805,9 +1155,9 @@ void DepthwiseConv3x3s2<int8_t, int32_t>(const framework::Tensor &input,
           : [filter_ptr] "r"(filter_ptr)
           : "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15");
       asm volatile(
-          "mov r0, #12 \n"
           "cmp %[loop], #0 \n"
           "ble start_remain_%= \n"
+          "mov r0, #12 \n"
           // loop 6 widths
           "loop_3h6w_%=: \n"
           "vld2.8 {d10, d11}, [%[input_ptr0]], r0 \n"
@@ -1050,25 +1400,26 @@ void DepthwiseConv3x3s2<int8_t, int32_t>(const framework::Tensor &input,
           "vst1.32 {d20[0]}, [%[output_ptr0]]! \n"
           "vst1.32 {d24[0]}, [%[output_ptr1]]! \n"
           "vst1.32 {d28[0]}, [%[output_ptr2]]! \n"
-          "end_%=: \n"
+          "end_%=:                             \n"
           : [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1),
             [output_ptr2] "+r"(output_ptr2), [input_ptr6] "+r"(input_ptr6),
             [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1),
             [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3),
             [input_ptr4] "+r"(input_ptr4), [input_ptr5] "+r"(input_ptr5),
             [loop] "+r"(loop)
-          : [remain] "r"(remain)
+          : [remain] "r"(output_w_remain)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
             "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0");
     }
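Annotation (not part of the patch): each pass of the h += 3 loop above writes output rows h..h+2 and reads seven consecutive input rows, since three stride-2 windows of height 3 span 2 * 2 + 3 = 7 rows; that is why seven input pointers are maintained and consecutive tiles overlap by one input row. A quick check of the arithmetic:

    // First input row touched by the tile: 2 * h - padding_h (tap 0 of row h).
    // Last input row touched: 2 * (h + 2) - padding_h + 2 = 2 * h - padding_h + 6.
    // Total: 7 rows -> input_ptr0 .. input_ptr6.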
-    int start_h = (output_h / 3) * 6;
-    for (int h = start_h; h < input_h - 2 /*(input_h - 1) - 1*/; h += 2) {
-      const int8_t* input_ptr0 = input_ptr + h * input_w;
-      const int8_t* input_ptr1 = input_ptr0 + input_w;
-      const int8_t* input_ptr2 = input_ptr1 + input_w;
-      int32_t* output_ptr0 = output_ptr + (h >> 1) * output_w;
-      int loop = loops;
+    int start_h = valid_h_start + valid_h / 3 * 3;
+    for (int h = start_h; h < valid_h_end; ++h) {
+      size_t offset = (2 * h - padding_h) * input_w + input_w_start;
+      const int8_t *input_ptr0 = input_ptr + offset;
+      const int8_t *input_ptr1 = input_ptr0 + input_w;
+      const int8_t *input_ptr2 = input_ptr1 + input_w;
+      int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start;
+      int loop = output_w_tiles;
       asm volatile(
           "vld1.32 {q0}, [%[filter_ptr]] \n"
           "vmovl.s8 q14, d0 \n"
@@ -1086,9 +1437,9 @@ void DepthwiseConv3x3s2<int8_t, int32_t>(const framework::Tensor &input,
           : [filter_ptr] "r"(filter_ptr)
           : "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15");
       asm volatile(
-          "mov r0, #12 \n"
           "cmp %[loop], #0 \n"
           "ble start_remain_%= \n"
+          "mov r0, #12 \n"
           // loop 6 widths
           "loop_1h6w_%=: \n"
           "vld2.8 {d10, d11}, [%[input_ptr0]], r0 \n"
@@ -1192,18 +1543,19 @@ void DepthwiseConv3x3s2<int8_t, int32_t>(const framework::Tensor &input,
           "cmp %[remain], #1 \n"
           "blt end_%= \n"
           "vst1.32 {d22[0]}, [%[output_ptr0]]! \n"
-          "end_%=: \n"
+          "end_%=:                             \n"
           : [output_ptr0] "+r"(output_ptr0), [input_ptr0] "+r"(input_ptr0),
             [input_ptr1] "+r"(input_ptr1), [input_ptr2] "+r"(input_ptr2),
             [loop] "+r"(loop)
-          : [remain] "r"(remain)
+          : [remain] "r"(output_w_remain)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
             "q8", "q9", "q10", "q11", "q12", "r0");
     }
   }
-#endif  // __aarch64__
 }

 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
+
+#endif
-- 
GitLab