Commit 94e89540 authored by hjchen2

Optimize int8 depthwise conv

Parent b1ca620b
@@ -88,8 +88,8 @@ template <>
 inline int8_t Round<ROUND_NEAREST_TO_EVEN>(const float &x) {
   float v = std::round(x);
   int32_t q = static_cast<int32_t>(v);
-  if (abs(abs(q - v) - 0.5) <= 0) {
-    if (abs(q) % 2 != 0) {
+  if (std::abs(std::abs(q - v) - 0.5) <= 0) {
+    if (std::abs(q) % 2 != 0) {
       q = q + ((q > 0) ? -1 : 1);
     }
   }
...
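The fix above matters because unqualified abs can bind to the C integer
overload: abs(q - v) then truncates a fractional difference such as 0.5 to 0,
so the tie test fires for every input, while std::abs keeps the arithmetic in
floating point. For reference, a self-contained sketch of the intended
round-to-nearest-even behavior (a hypothetical RoundToEven helper, not code
from this patch; it tests the tie against the original x):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Round to the nearest integer, breaking .5 ties toward the even neighbor.
    int32_t RoundToEven(float x) {
      float v = std::round(x);              // rounds ties away from zero
      int32_t q = static_cast<int32_t>(v);
      if (std::abs(x - q) == 0.5f) {        // x sits exactly between two ints
        if (std::abs(q) % 2 != 0) {
          q += (q > 0) ? -1 : 1;            // step back to the even neighbor
        }
      }
      return q;
    }

    int main() {
      for (float x : {0.5f, 1.5f, 2.5f, -0.5f, -1.5f}) {
        std::printf("%5.1f -> %d\n", x, RoundToEven(x));
      }
      // prints 0, 2, 2, 0, -2
    }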
@@ -180,10 +180,10 @@ inline void DepthwiseConv3x3(const ConvParam<CPU> &param) {
     Tensor in_batch = input->Slice(i, i + 1);
     Tensor out_batch = output->Slice(i, i + 1);
     if (strides[0] == 1) {
-      math::DepthwiseConv3x3s1<Itype, Otype>(in_batch, *filter, paddings,
-                                             &out_batch);
+      math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings,
+                                             &out_batch);
     } else if (strides[0] == 2) {
-      math::DepthwiseConv3x3s2<Itype, Otype>(in_batch, *filter, paddings,
-                                             &out_batch);
+      math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings,
+                                             &out_batch);
     } else {
       // math::DepthwiseConv3x3<Itype, Otype>(input_pad, *filter,
...
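For context: the dispatcher above picks the stride-1 or stride-2 kernel, and
the output extents follow the usual convolution geometry. A minimal sketch of
the arithmetic for a 3x3 window (hypothetical ConvOutSize helper, not from
this repo):

    #include <cstdio>

    // Output extent of a 3x3 conv along one axis:
    // floor((in + 2*pad - 3) / stride) + 1.
    int ConvOutSize(int in, int pad, int stride) {
      return (in + 2 * pad - 3) / stride + 1;
    }

    int main() {
      std::printf("%d\n", ConvOutSize(224, 1, 1));  // 224: stride 1, pad 1 keeps size
      std::printf("%d\n", ConvOutSize(224, 1, 2));  // 112: stride 2 halves it
    }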
@@ -74,13 +74,13 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
 //                          framework::Tensor *output);

 template <typename Itype, typename Otype>
-void DepthwiseConv3x3s1(const framework::Tensor &input,
+void DepthwiseConv3x3S1(const framework::Tensor &input,
                         const framework::Tensor &filter,
                         const std::vector<int> &paddings,
                         framework::Tensor *output);

 template <typename Itype, typename Otype>
-void DepthwiseConv3x3s2(const framework::Tensor &input,
+void DepthwiseConv3x3S2(const framework::Tensor &input,
                         const framework::Tensor &filter,
                         const std::vector<int> &paddings,
                         framework::Tensor *output);
...
@@ -12,12 +12,300 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#if defined(__ARM_NEON__) && !defined(__aarch64__)
 #include "operators/math/depthwise_conv3x3.h"
 #ifdef __ARM_NEON__
 #include <arm_neon.h>
 #endif

 namespace paddle_mobile {
 namespace operators {
 namespace math {
+
+template <int Stride>
+inline void Depth3x3ValidColLoadInput(const int8_t *input, const int input_w,
+                                      const int valid_cols, int16x8_t *y0,
+                                      int16x8_t *y1, int16x8_t *y2) {
+  PADDLE_MOBILE_THROW_EXCEPTION("Stride %d is not supported.", Stride);
+}
+
+template <>
+inline void Depth3x3ValidColLoadInput<1>(const int8_t *input, const int input_w,
+                                         const int valid_cols, int16x8_t *y0,
+                                         int16x8_t *y1, int16x8_t *y2) {
+  int8_t fake_input[3][8];
+  if (valid_cols == 1) {
+    for (int i = 0; i < 8; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+    }
+  } else if (valid_cols == 2) {
+    for (int i = 0; i < 8; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+      fake_input[1][i] = input[1];
+    }
+  } else {
+    for (int i = 0; i < 8; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+      fake_input[1][i] = input[1];
+      fake_input[2][i] = input[2];
+    }
+  }
+  int8x8_t input0 = vld1_s8(fake_input[0]);
+  int8x8_t input1 = vld1_s8(fake_input[1]);
+  int8x8_t input2 = vld1_s8(fake_input[2]);
+  y0[0] = vmovl_s8(input0);
+  y1[0] = vmovl_s8(input1);
+  y2[0] = vmovl_s8(input2);
+  y0[1] = vextq_s16(y0[0], y0[0], 1);
+  y0[2] = vextq_s16(y0[0], y0[0], 2);
+  y1[1] = vextq_s16(y1[0], y1[0], 1);
+  y1[2] = vextq_s16(y1[0], y1[0], 2);
+  y2[1] = vextq_s16(y2[0], y2[0], 1);
+  y2[2] = vextq_s16(y2[0], y2[0], 2);
+}
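The stride-1 loader widens eight int8 values to int16 and then builds the two
shifted views the remaining filter taps need: vextq_s16(v, v, n) yields the
same vector advanced by n elements. A standalone sketch of that shift (assumes
a 32-bit ARM NEON toolchain; not code from this patch):

    #include <arm_neon.h>
    #include <cstdio>

    int main() {
      int8_t row[8] = {10, 11, 12, 13, 14, 15, 16, 17};
      int16x8_t y0 = vmovl_s8(vld1_s8(row));  // elements w+0..w+7
      int16x8_t y1 = vextq_s16(y0, y0, 1);    // elements w+1..: second tap
      int16x8_t y2 = vextq_s16(y0, y0, 2);    // elements w+2..: third tap
      std::printf("%d %d %d\n", vgetq_lane_s16(y0, 0), vgetq_lane_s16(y1, 0),
                  vgetq_lane_s16(y2, 0));     // prints: 10 11 12
    }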
+
+template <>
+inline void Depth3x3ValidColLoadInput<2>(const int8_t *input, const int input_w,
+                                         const int valid_cols, int16x8_t *y0,
+                                         int16x8_t *y1, int16x8_t *y2) {
+  int8_t fake_input[3][13];
+  if (valid_cols == 1) {
+    for (int i = 0; i < 13; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+    }
+  } else if (valid_cols == 2) {
+    for (int i = 0; i < 13; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+      fake_input[1][i] = input[1];
+    }
+  } else {
+    for (int i = 0; i < 13; ++i, input += input_w) {
+      fake_input[0][i] = input[0];
+      fake_input[1][i] = input[1];
+      fake_input[2][i] = input[2];
+    }
+  }
+  int8x8x2_t input0 = vld2_s8(fake_input[0]);
+  int8x8x2_t input1 = vld2_s8(fake_input[1]);
+  int8x8x2_t input2 = vld2_s8(fake_input[2]);
+  y0[0] = vmovl_s8(input0.val[0]);
+  y0[1] = vmovl_s8(input0.val[1]);
+  y0[2] = vextq_s16(y0[0], y0[0], 1);
+  y1[0] = vmovl_s8(input1.val[0]);
+  y1[1] = vmovl_s8(input1.val[1]);
+  y1[2] = vextq_s16(y1[0], y1[0], 1);
+  y2[0] = vmovl_s8(input2.val[0]);
+  y2[1] = vmovl_s8(input2.val[1]);
+  y2[2] = vextq_s16(y2[0], y2[0], 1);
+}
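In the stride-2 specialization, vld2_s8 deinterleaves 16 consecutive bytes
into even and odd lanes, so val[0] and val[1] hold the even and odd columns a
stride-2 window consumes. A minimal sketch of the deinterleave (same NEON
assumptions as above):

    #include <arm_neon.h>
    #include <cstdio>

    int main() {
      // 16 consecutive int8 pixels of one row.
      int8_t row[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
      int8x8x2_t v = vld2_s8(row);           // deinterleave: even/odd lanes
      int16x8_t even = vmovl_s8(v.val[0]);   // columns 0,2,4,...,14
      int16x8_t odd = vmovl_s8(v.val[1]);    // columns 1,3,5,...,15
      // For a stride-2 3x3 window, taps 0/1/2 come from even, odd, even+1.
      int16x8_t even_shift = vextq_s16(even, even, 1);  // columns 2,4,6,...
      std::printf("%d %d %d\n", vgetq_lane_s16(even, 0),
                  vgetq_lane_s16(odd, 0),
                  vgetq_lane_s16(even_shift, 0));  // prints: 0 1 2
    }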
+
+template <int Stride_h, int Stride_w>
+inline void DepthwiseConv3x3ValidCol(const int8_t *input, const int8_t *filter,
+                                     const int h_output, const int h_output_end,
+                                     const int w_output, const int input_h,
+                                     const int input_w, const int padding_h,
+                                     const int padding_w, const int output_w,
+                                     int32_t *output) {
+  const int w_in_start = -padding_w + w_output * Stride_w;
+  const int w_in_end = w_in_start + 3;
+  const int w_start = w_in_start > 0 ? w_in_start : 0;
+  const int w_end = w_in_end < input_w ? w_in_end : input_w;
+  int remain_start = h_output;
+#ifdef __ARM_NEON__
+  int output_tiles = (h_output_end - h_output) / 6;
+  remain_start = h_output + output_tiles * 6;
+  int input_h_start = h_output * Stride_h - padding_h;
+  size_t input_offset = input_h_start * input_w + w_start;
+  size_t output_offset = h_output * output_w + w_output;
+  int16x8_t _input[3][3];
+  int16x4_t _kernel[3];
+  int32x4_t _sum0, _sum1;
+  const int8_t *filter_ptr = filter;
+  asm volatile(
+      "mov        r0, #3                \n"
+      "vld1.s8    d10, [%[filter]], r0  \n"
+      "vld1.s8    d11, [%[filter]], r0  \n"
+      "vld1.s8    d12, [%[filter]]      \n"
+      "vtrn.8     d10, d11              \n"
+      "vtrn.8     d12, d13              \n"
+      "vtrn.16    d10, d12              \n"
+      "vtrn.16    d11, d13              \n"
+      "vmovl.s8   q7, d10               \n"
+      "vmovl.s8   q8, d11               \n"
+      "vmovl.s8   q9, d12               \n"
+      "vmov.32    %[_kernel0], d14      \n"
+      "vmov.32    %[_kernel1], d16      \n"
+      "vmov.32    %[_kernel2], d18      \n"
+      : [_kernel0] "+w"(_kernel[0]), [_kernel1] "+w"(_kernel[1]),
+        [_kernel2] "+w"(_kernel[2])
+      : [filter] "r"(filter_ptr)
+      : "memory", "q5", "q6", "q7", "q8", "q9", "r0");
+  int valid_cols = w_end - w_start;
+  for (int h = 0; h < output_tiles * 6; h += 6) {
+    int32_t *output0 = output + output_offset;
+    int32_t *output1 = output0 + output_w;
+    int32_t *output2 = output1 + output_w;
+    int32_t *output3 = output2 + output_w;
+    int32_t *output4 = output3 + output_w;
+    int32_t *output5 = output4 + output_w;
+    Depth3x3ValidColLoadInput<Stride_w>(input + input_offset, input_w,
+                                        valid_cols, _input[0], _input[1],
+                                        _input[2]);
+    _sum0 = veorq_s32(_sum0, _sum0);
+    _sum1 = veorq_s32(_sum1, _sum1);
+    for (int w_in = 0; w_in < valid_cols; ++w_in) {
+      int index = w_in + w_start - w_in_start;
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_input[w_in][0]),
+                             _kernel[index], 0);
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_input[w_in][1]),
+                             _kernel[index], 1);
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_input[w_in][2]),
+                             _kernel[index], 2);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_input[w_in][0]),
+                             _kernel[index], 0);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_input[w_in][1]),
+                             _kernel[index], 1);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_input[w_in][2]),
+                             _kernel[index], 2);
+    }
+    vst1q_lane_s32(output0, _sum0, 0);
+    vst1q_lane_s32(output1, _sum0, 1);
+    vst1q_lane_s32(output2, _sum0, 2);
+    vst1q_lane_s32(output3, _sum0, 3);
+    vst1q_lane_s32(output4, _sum1, 0);
+    vst1q_lane_s32(output5, _sum1, 1);
+    input_offset += 6 * Stride_h * input_w;
+    output_offset += 6 * output_w;
+  }
+#endif
+  for (int h = remain_start; h < h_output_end; ++h) {
+    int32_t value = 0;
+    const int h_in_start = -padding_h + h * Stride_h;
+    for (int i = 0; i < 3; ++i) {
+      for (int w_in = w_start; w_in < w_end; ++w_in) {
+        value += filter[i * 3 + (w_in - w_in_start)] *
+                 input[(h_in_start + i) * input_w + w_in];
+      }
+    }
+    output[h * output_w + w_output] = value;
+  }
+}
+
+#define DEPTHWISE_CONV_NORMAL_BORDER(start, end)                         \
+  for (int w = start; w < end; ++w) {                                    \
+    const int w_in_start = -padding_w + w * Stride_w;                    \
+    const int w_in_end = w_in_start + 3;                                 \
+    const int w_start = w_in_start > 0 ? w_in_start : 0;                 \
+    const int w_end = w_in_end < input_w ? w_in_end : input_w;           \
+    int32_t value = 0;                                                   \
+    for (int h_in = h_start; h_in < h_end; ++h_in) {                     \
+      for (int w_in = w_start; w_in < w_end; ++w_in) {                   \
+        value += filter[(h_in - h_in_start) * 3 + (w_in - w_in_start)] * \
+                 input[h_in * input_w + w_in];                           \
+      }                                                                  \
+    }                                                                    \
+    output_ptr[w] = value;                                               \
+  }
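The macro above expands to a plain scalar loop over the border columns; an
equivalent (hypothetical) function form, shown only for readability:

    #include <cstdint>

    // Hypothetical function form of DEPTHWISE_CONV_NORMAL_BORDER: scalar 3x3
    // accumulation over the window columns that actually overlap the input.
    template <int Stride_w>
    void NormalBorder(int start, int end, int h_start, int h_end,
                      int h_in_start, int padding_w, int input_w,
                      const int8_t *input, const int8_t *filter,
                      int32_t *output_ptr) {
      for (int w = start; w < end; ++w) {
        const int w_in_start = -padding_w + w * Stride_w;  // leftmost tap
        const int w_in_end = w_in_start + 3;
        const int w_start = w_in_start > 0 ? w_in_start : 0;        // clip left
        const int w_end = w_in_end < input_w ? w_in_end : input_w;  // clip right
        int32_t value = 0;
        for (int h_in = h_start; h_in < h_end; ++h_in) {
          for (int w_in = w_start; w_in < w_end; ++w_in) {
            value += filter[(h_in - h_in_start) * 3 + (w_in - w_in_start)] *
                     input[h_in * input_w + w_in];
          }
        }
        output_ptr[w] = value;
      }
    }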
+
+template <int Stride>
+inline void Depth3x3NormalRowLoadInput(const int8_t *input,
+                                       int16x8_t &y0,  // NOLINT
+                                       int16x8_t &y1,  // NOLINT
+                                       int16x8_t &y2) {  // NOLINT
+  PADDLE_MOBILE_THROW_EXCEPTION("Stride %d is not supported.", Stride);
+}
+
+template <>
+inline void Depth3x3NormalRowLoadInput<1>(const int8_t *input,
+                                          int16x8_t &y0,  // NOLINT
+                                          int16x8_t &y1,  // NOLINT
+                                          int16x8_t &y2) {  // NOLINT
+  int8x8_t x0 = vld1_s8(input);
+  y0 = vmovl_s8(x0);
+  y1 = vextq_s16(y0, y0, 1);
+  y2 = vextq_s16(y1, y1, 1);
+}
+
+template <>
+inline void Depth3x3NormalRowLoadInput<2>(const int8_t *input,
+                                          int16x8_t &y0,  // NOLINT
+                                          int16x8_t &y1,  // NOLINT
+                                          int16x8_t &y2) {  // NOLINT
+  int8x8x2_t x0 = vld2_s8(input);
+  y0 = vmovl_s8(x0.val[0]);
+  y1 = vmovl_s8(x0.val[1]);
+  y2 = vextq_s16(y0, y0, 1);
+}
+
+template <int Stride_h, int Stride_w>
+inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
+                                      const int h_output, const int input_h,
+                                      const int input_w, const int padding_h,
+                                      const int padding_w, const int output_w,
+                                      int32_t *output) {
+  const int h_in_start = -padding_h + h_output * Stride_h;
+  const int h_in_end = h_in_start + 3;
+  const int h_start = h_in_start > 0 ? h_in_start : 0;
+  const int h_end = h_in_end < input_h ? h_in_end : input_h;
+
+  int valid_w_start = (padding_w + Stride_w - 1) / Stride_w;
+  int valid_w_end = output_w - valid_w_start;
+
+  int32_t *output_ptr = output + h_output * output_w;
+  // border left
+  DEPTHWISE_CONV_NORMAL_BORDER(0, valid_w_start)
+  // middle
+  int remain_start = valid_w_start;
+#ifdef __ARM_NEON__
+  int output_tiles = (valid_w_end - valid_w_start) / 6;
+  remain_start = valid_w_start + output_tiles * 6;
+  int32x4_t _sum0, _sum1;
+  int16x8_t y0, y1, y2;
+  int16x4_t _kernel[3];
+  for (int h_in = h_start; h_in < h_end; ++h_in) {
+    int index = h_in - h_in_start;
+    int8x8_t w0 = vld1_s8(filter + index * 3);
+    int16x8_t w1 = vmovl_s8(w0);
+    _kernel[index] = vget_low_s16(w1);
+  }
+  for (int w = 0; w < output_tiles * 6; w += 6) {
+    _sum0 = veorq_s32(_sum0, _sum0);
+    _sum1 = veorq_s32(_sum1, _sum1);
+    int output_offset = valid_w_start + w;
+    int input_w_offset = output_offset * Stride_w - padding_w;
+    for (int h_in = h_start; h_in < h_end; ++h_in) {
+      int index = h_in - h_in_start;
+      Depth3x3NormalRowLoadInput<Stride_w>(
+          input + h_in * input_w + input_w_offset, y0, y1, y2);
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(y0), _kernel[index], 0);
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(y1), _kernel[index], 1);
+      _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(y2), _kernel[index], 2);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(y0), _kernel[index], 0);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(y1), _kernel[index], 1);
+      _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(y2), _kernel[index], 2);
+    }
+    vst1q_s32(output_ptr + output_offset, _sum0);
+    vst1q_lane_s32(output_ptr + output_offset + 4, _sum1, 0);
+    vst1q_lane_s32(output_ptr + output_offset + 5, _sum1, 1);
+  }
+#endif
+  for (int w = remain_start; w < valid_w_end; ++w) {
+    int32_t value = 0;
+    int input_start = -padding_w + w * Stride_w;
+    for (int h_in = h_start; h_in < h_end; ++h_in) {
+      for (int j = 0; j < 3; ++j) {
+        value += filter[(h_in - h_in_start) * 3 + j] *
+                 input[h_in * input_w + j + input_start];
+      }
+    }
+    output_ptr[w] = value;
+  }
+  // border right
+  DEPTHWISE_CONV_NORMAL_BORDER(valid_w_end, output_w)
+}
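Seen together, these helpers decompose each output feature map into five
regions, and only the interior escapes per-pixel bounds checks (it is handled
by the NEON-tiled "valid" loops in the functions below):

    +--------------------------+
    |         top rows         |   <- DepthwiseConv3x3NormalRow
    +------+------------+------+
    | left |   valid    | right|   <- ValidCol | tiled interior | ValidCol
    +------+------------+------+
    |       bottom rows        |   <- DepthwiseConv3x3NormalRow
    +--------------------------+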
 // template<>
 // void DepthwiseConv3x3<int8_t, int32_t>(
 //     const framework::Tensor *input, const framework::Tensor *filter,
@@ -27,44 +315,72 @@ namespace math {
 // }

 template <>
-void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
+void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
                                          const framework::Tensor &filter,
                                          const std::vector<int> &paddings,
                                          framework::Tensor *output) {
   const int8_t *input_data = input.data<int8_t>();
   const int8_t *filter_data = filter.data<int8_t>();
   int32_t *out_data = output->mutable_data<int32_t>();
-  // make sure that batch size is 1
-  int input_c = input.dims()[1];
   int input_h = input.dims()[2];
   int input_w = input.dims()[3];
-  int output_c = output->dims()[1];
   int output_h = output->dims()[2];
   int output_w = output->dims()[3];
+  int padding_h = paddings[0];
+  int padding_w = paddings[1];
   int image_size = input_h * input_w;
   int out_image_size = output_h * output_w;
-#if __aarch64__
-  // TODO(hjchen2)
-#else
+  int valid_h_start = padding_h;
+  int valid_h_end = output_h - valid_h_start;
+  int valid_h = valid_h_end - valid_h_start;
+  int valid_w_start = padding_w;
+  int valid_w_end = output_w - valid_w_start;
+  int valid_w = valid_w_end - valid_w_start;
 #pragma omp parallel for
-  for (int g = 0; g < input_c; ++g) {
-    const int8_t* input_ptr = input_data + g * image_size;
-    const int8_t* filter_ptr = filter_data + g * 9;
-    int32_t* output_ptr = out_data + g * out_image_size;
-    int loops = (input_w - 2) / 6;
-    int remain = input_w - 2 - loops * 6;
-    for (int h = 0; h < input_h - 5 /*(input_h - 2) - 3*/; h += 4) {
-      const int8_t* input_ptr0 = input_ptr + h * input_w;
-      const int8_t* input_ptr1 = input_ptr0 + input_w;
-      const int8_t* input_ptr2 = input_ptr1 + input_w;
-      const int8_t* input_ptr3 = input_ptr2 + input_w;
-      const int8_t* input_ptr4 = input_ptr3 + input_w;
-      const int8_t* input_ptr5 = input_ptr4 + input_w;
-      int32_t* output_ptr0 = output_ptr + h * output_w;
-      int32_t* output_ptr1 = output_ptr0 + output_w;
-      int32_t* output_ptr2 = output_ptr1 + output_w;
-      int32_t* output_ptr3 = output_ptr2 + output_w;
-      int loop = loops;
+  for (int g = 0; g < input.dims()[1]; ++g) {
+    const int8_t *input_ptr = input_data + g * image_size;
+    const int8_t *filter_ptr = filter_data + g * 9;
+    int32_t *output_ptr = out_data + g * out_image_size;
+    // top
+    for (int h = 0; h < valid_h_start; ++h) {
+      DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h,
+                                      input_w, padding_h, padding_w, output_w,
+                                      output_ptr);
+    }
+    // left
+    for (int w = 0; w < valid_w_start; ++w) {
+      DepthwiseConv3x3ValidCol<1, 1>(
+          input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h,
+          input_w, padding_h, padding_w, output_w, output_ptr);
+    }
+    // right
+    for (int w = valid_w_end; w < output_w; ++w) {
+      DepthwiseConv3x3ValidCol<1, 1>(
+          input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h,
+          input_w, padding_h, padding_w, output_w, output_ptr);
+    }
+    // bottom
+    for (int h = valid_h_end; h < output_h; ++h) {
+      DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h,
+                                      input_w, padding_h, padding_w, output_w,
+                                      output_ptr);
+    }
+    // valid
+    int output_w_tiles = valid_w / 6;
+    int output_w_remain = valid_w - output_w_tiles * 6;
+    for (int h = valid_h_start; h < valid_h_end - 3; h += 4) {
+      const int8_t *input_ptr0 = input_ptr + (h - padding_h) * input_w;
+      const int8_t *input_ptr1 = input_ptr0 + input_w;
+      const int8_t *input_ptr2 = input_ptr1 + input_w;
+      const int8_t *input_ptr3 = input_ptr2 + input_w;
+      const int8_t *input_ptr4 = input_ptr3 + input_w;
+      const int8_t *input_ptr5 = input_ptr4 + input_w;
+      int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start;
+      int32_t *output_ptr1 = output_ptr0 + output_w;
+      int32_t *output_ptr2 = output_ptr1 + output_w;
+      int32_t *output_ptr3 = output_ptr2 + output_w;
+      int loop = output_w_tiles;
       asm volatile(
           "vld1.32 {q0}, [%[filter_ptr]] \n"
           "vmovl.s8 q14, d0 \n"
@@ -385,20 +701,20 @@ void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
            [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3),
            [input_ptr4] "+r"(input_ptr4), [input_ptr5] "+r"(input_ptr5),
            [loop] "+r"(loop)
-          : [remain] "r"(remain)
+          : [remain] "r"(output_w_remain)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
             "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0");
     }
     // remain height
-    int start_h = (input_h - 2) & 0xFFFC;
-    for (int h = start_h; h < input_h - 3 /*(input_h - 2) - 1*/; h += 2) {
-      const int8_t* input_ptr0 = input_ptr + h * input_w;
-      const int8_t* input_ptr1 = input_ptr0 + input_w;
-      const int8_t* input_ptr2 = input_ptr1 + input_w;
-      const int8_t* input_ptr3 = input_ptr2 + input_w;
-      int32_t* output_ptr0 = output_ptr + h * output_w;
-      int32_t* output_ptr1 = output_ptr0 + output_w;
-      int loop = loops;
+    int start_h = valid_h_start + (valid_h & 0xFFFC);
+    for (int h = start_h; h < valid_h_end - 1; h += 2) {
+      const int8_t *input_ptr0 = input_ptr + (h - padding_h) * input_w;
+      const int8_t *input_ptr1 = input_ptr0 + input_w;
+      const int8_t *input_ptr2 = input_ptr1 + input_w;
+      const int8_t *input_ptr3 = input_ptr2 + input_w;
+      int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start;
+      int32_t *output_ptr1 = output_ptr0 + output_w;
+      int loop = output_w_tiles;
       asm volatile(
           "vld1.32 {q0}, [%[filter_ptr]] \n"
           "vmovl.s8 q14, d0 \n"
@@ -416,9 +732,9 @@ void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
           : [filter_ptr] "r"(filter_ptr)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15");
       asm volatile(
-          "mov r0, #6 \n"
           "cmp %[loop], #0 \n"
           "ble start_remain_%= \n"
+          "mov r0, #6 \n"
           // loop 6 widths
           "loop_2h6w_%=: \n"
           "vld1.32 {d9}, [%[input_ptr0]], r0 \n"
@@ -595,18 +911,18 @@ void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
            [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1),
            [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3),
            [loop] "+r"(loop)
-          : [remain] "r"(remain)
+          : [remain] "r"(output_w_remain)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
             "q8", "q9", "q10", "q11", "q12", "q13", "r0");
     }
-    start_h = (input_h - 2) & 0xFFFE;
-    if (start_h < input_h - 2) {
-      const int8_t* input_ptr0 = input_ptr + start_h * input_w;
-      const int8_t* input_ptr1 = input_ptr0 + input_w;
-      const int8_t* input_ptr2 = input_ptr1 + input_w;
-      int32_t* output_ptr0 = output_ptr + start_h * output_w;
-      int loop = loops;
+    start_h = valid_h_start + (valid_h & 0xFFFE);
+    if (start_h < valid_h_end) {
+      const int8_t *input_ptr0 = input_ptr + (start_h - padding_h) * input_w;
+      const int8_t *input_ptr1 = input_ptr0 + input_w;
+      const int8_t *input_ptr2 = input_ptr1 + input_w;
+      int32_t *output_ptr0 = output_ptr + start_h * output_w + valid_w_start;
+      int loop = output_w_tiles;
       asm volatile(
           "vld1.32 {q0}, [%[filter_ptr]] \n"
           "vmovl.s8 q14, d0 \n"
@@ -624,9 +940,9 @@ void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
           : [filter_ptr] "r"(filter_ptr)
           : "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15");
       asm volatile(
-          "mov r0, #6 \n"
           "cmp %[loop], #0 \n"
           "ble start_remain_%= \n"
+          "mov r0, #6 \n"
           // loop 6 widths
           "loop_1h6w_%=: \n"
           "vld1.32 {d9}, [%[input_ptr0]], r0 \n"
@@ -741,53 +1057,87 @@ void DepthwiseConv3x3s1<int8_t, int32_t>(const framework::Tensor &input,
           : [output_ptr0] "+r"(output_ptr0), [input_ptr0] "+r"(input_ptr0),
            [input_ptr1] "+r"(input_ptr1), [input_ptr2] "+r"(input_ptr2),
            [loop] "+r"(loop)
-          : [remain] "r"(remain)
+          : [remain] "r"(output_w_remain)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
             "q8", "q9", "q10", "q11", "r0");
     }
   }
-#endif  // __aarch64__
 }
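In DepthwiseConv3x3S1 above, the interior is walked four output rows at a
time, then a two-row and a one-row tail: valid_h & 0xFFFC and
valid_h & 0xFFFE round the processed row count down to multiples of 4 and 2.
A quick illustration of that masking arithmetic (hypothetical demo):

    #include <cstdio>

    int main() {
      for (int valid_h : {5, 6, 7, 8}) {
        int by4 = valid_h & 0xFFFC;  // largest multiple of 4 <= valid_h
        int by2 = valid_h & 0xFFFE;  // largest multiple of 2 <= valid_h
        std::printf("valid_h=%d -> 4-row tiles end at %d, 2-row at %d\n",
                    valid_h, by4, by2);
      }
    }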
 template <>
-void DepthwiseConv3x3s2<int8_t, int32_t>(const framework::Tensor &input,
+void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
                                          const framework::Tensor &filter,
                                          const std::vector<int> &paddings,
                                          framework::Tensor *output) {
   const int8_t *input_data = input.data<int8_t>();
   const int8_t *filter_data = filter.data<int8_t>();
   int32_t *out_data = output->mutable_data<int32_t>();
-  // make sure that batch size is 1
-  int input_c = input.dims()[1];
   int input_h = input.dims()[2];
   int input_w = input.dims()[3];
-  int output_c = output->dims()[1];
   int output_h = output->dims()[2];
   int output_w = output->dims()[3];
+  int padding_h = paddings[0];
+  int padding_w = paddings[1];
   int image_size = input_h * input_w;
   int out_image_size = output_h * output_w;
-#if __aarch64__
-  // TODO(hjchen2)
-#else
+  int valid_h_start = (padding_h + 1) / 2;
+  int valid_h_end = output_h - valid_h_start;
+  int valid_h = valid_h_end - valid_h_start;
+  int valid_w_start = (padding_w + 1) / 2;
+  int valid_w_end = output_w - valid_w_start;
+  int valid_w = valid_w_end - valid_w_start;
+  // DLOG << "valid_h_start: " << valid_h_start;
+  // DLOG << "valid_h_end: " << valid_h_end;
+  // DLOG << "valid_w_start: " << valid_w_start;
+  // DLOG << "valid_w_end: " << valid_w_end;
 #pragma omp parallel for
-  for (int g = 0; g < input_c; ++g) {
-    const int8_t* input_ptr = input_data + g * image_size;
-    const int8_t* filter_ptr = filter_data + g * 9;
-    int32_t* output_ptr = out_data + g * out_image_size;
-    int loops = output_w / 6;
-    int remain = output_w - loops * 6;
-    for (int h = 0; h < input_h - 6 /*(input_h - 1) - 5*/; h += 6) {
-      const int8_t* input_ptr0 = input_ptr + h * input_w;
-      const int8_t* input_ptr1 = input_ptr0 + input_w;
-      const int8_t* input_ptr2 = input_ptr1 + input_w;
-      const int8_t* input_ptr3 = input_ptr2 + input_w;
-      const int8_t* input_ptr4 = input_ptr3 + input_w;
-      const int8_t* input_ptr5 = input_ptr4 + input_w;
-      const int8_t* input_ptr6 = input_ptr5 + input_w;
-      int32_t* output_ptr0 = output_ptr + (h >> 1) * output_w;
-      int32_t* output_ptr1 = output_ptr0 + output_w;
-      int32_t* output_ptr2 = output_ptr1 + output_w;
-      int loop = loops;
+  for (int g = 0; g < input.dims()[1]; ++g) {
+    const int8_t *input_ptr = input_data + g * image_size;
+    const int8_t *filter_ptr = filter_data + g * 9;
+    int32_t *output_ptr = out_data + g * out_image_size;
+    // top
+    for (int h = 0; h < valid_h_start; ++h) {
+      DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h,
+                                      input_w, padding_h, padding_w, output_w,
+                                      output_ptr);
+    }
+    // left
+    for (int w = 0; w < valid_w_start; ++w) {
+      DepthwiseConv3x3ValidCol<2, 2>(
+          input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h,
+          input_w, padding_h, padding_w, output_w, output_ptr);
+    }
+    // right
+    for (int w = valid_w_end; w < output_w; ++w) {
+      DepthwiseConv3x3ValidCol<2, 2>(
+          input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h,
+          input_w, padding_h, padding_w, output_w, output_ptr);
+    }
+    // bottom
+    for (int h = valid_h_end; h < output_h; ++h) {
+      DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h,
+                                      input_w, padding_h, padding_w, output_w,
+                                      output_ptr);
+    }
+    // valid
+    int input_w_start = 2 * valid_w_start - padding_w;
+    int output_w_tiles = valid_w / 6;
+    int output_w_remain = valid_w - output_w_tiles * 6;
+    for (int h = valid_h_start; h < valid_h_end - 2; h += 3) {
+      size_t offset = (2 * h - padding_h) * input_w + input_w_start;
+      const int8_t *input_ptr0 = input_ptr + offset;
+      const int8_t *input_ptr1 = input_ptr0 + input_w;
+      const int8_t *input_ptr2 = input_ptr1 + input_w;
+      const int8_t *input_ptr3 = input_ptr2 + input_w;
+      const int8_t *input_ptr4 = input_ptr3 + input_w;
+      const int8_t *input_ptr5 = input_ptr4 + input_w;
+      const int8_t *input_ptr6 = input_ptr5 + input_w;
+      int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start;
+      int32_t *output_ptr1 = output_ptr0 + output_w;
+      int32_t *output_ptr2 = output_ptr1 + output_w;
+      int loop = output_w_tiles;
       asm volatile(
           "vld1.32 {q0}, [%[filter_ptr]] \n"
           "vmovl.s8 q14, d0 \n"
@@ -805,9 +1155,9 @@ void DepthwiseConv3x3s2<int8_t, int32_t>(const framework::Tensor &input,
           : [filter_ptr] "r"(filter_ptr)
           : "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15");
       asm volatile(
-          "mov r0, #12 \n"
           "cmp %[loop], #0 \n"
           "ble start_remain_%= \n"
+          "mov r0, #12 \n"
           // loop 6 widths
           "loop_3h6w_%=: \n"
           "vld2.8 {d10, d11}, [%[input_ptr0]], r0 \n"
@@ -1057,18 +1407,19 @@ void DepthwiseConv3x3s2<int8_t, int32_t>(const framework::Tensor &input,
            [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3),
            [input_ptr4] "+r"(input_ptr4), [input_ptr5] "+r"(input_ptr5),
            [loop] "+r"(loop)
-          : [remain] "r"(remain)
+          : [remain] "r"(output_w_remain)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
             "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0");
     }
-    int start_h = (output_h / 3) * 6;
-    for (int h = start_h; h < input_h - 2 /*(input_h - 1) - 1*/; h += 2) {
-      const int8_t* input_ptr0 = input_ptr + h * input_w;
-      const int8_t* input_ptr1 = input_ptr0 + input_w;
-      const int8_t* input_ptr2 = input_ptr1 + input_w;
-      int32_t* output_ptr0 = output_ptr + (h >> 1) * output_w;
-      int loop = loops;
+    int start_h = valid_h_start + valid_h / 3 * 3;
+    for (int h = start_h; h < valid_h_end; ++h) {
+      size_t offset = (2 * h - padding_h) * input_w + input_w_start;
+      const int8_t *input_ptr0 = input_ptr + offset;
+      const int8_t *input_ptr1 = input_ptr0 + input_w;
+      const int8_t *input_ptr2 = input_ptr1 + input_w;
+      int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start;
+      int loop = output_w_tiles;
       asm volatile(
           "vld1.32 {q0}, [%[filter_ptr]] \n"
           "vmovl.s8 q14, d0 \n"
@@ -1086,9 +1437,9 @@ void DepthwiseConv3x3s2<int8_t, int32_t>(const framework::Tensor &input,
           : [filter_ptr] "r"(filter_ptr)
           : "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15");
       asm volatile(
-          "mov r0, #12 \n"
           "cmp %[loop], #0 \n"
           "ble start_remain_%= \n"
+          "mov r0, #12 \n"
          // loop 6 widths
          "loop_1h6w_%=: \n"
          "vld2.8 {d10, d11}, [%[input_ptr0]], r0 \n"
@@ -1196,14 +1547,15 @@ void DepthwiseConv3x3s2<int8_t, int32_t>(const framework::Tensor &input,
           : [output_ptr0] "+r"(output_ptr0), [input_ptr0] "+r"(input_ptr0),
            [input_ptr1] "+r"(input_ptr1), [input_ptr2] "+r"(input_ptr2),
            [loop] "+r"(loop)
-          : [remain] "r"(remain)
+          : [remain] "r"(output_w_remain)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
             "q8", "q9", "q10", "q11", "q12", "r0");
     }
   }
-#endif  // __aarch64__
 }
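For the stride-2 kernel, output row h consumes input rows 2*h - padding_h
through 2*h - padding_h + 2, so a three-output-row tile touches seven
consecutive input rows, which is why input_ptr0 through input_ptr6 are set up
above. A quick check of the mapping (hypothetical demo):

    #include <cstdio>

    int main() {
      const int padding_h = 1;
      for (int h = 1; h <= 3; ++h) {  // three stride-2 output rows
        int first = 2 * h - padding_h;
        std::printf("output row %d <- input rows %d..%d\n", h, first, first + 2);
      }
      // rows 1..3 read input rows 1..3, 3..5, 5..7: seven rows in total
    }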
 } // namespace math
 } // namespace operators
 } // namespace paddle_mobile
+
+#endif