From fb7b353e1fad95a60bcc843f607c50c05d3d9648 Mon Sep 17 00:00:00 2001 From: Bin Li Date: Fri, 4 May 2018 10:58:38 +0800 Subject: [PATCH] fix neon 5x5 7x7 invalid memory read --- mace/kernels/arm/conv_2d_neon_5x5.cc | 20 +++---- mace/kernels/arm/conv_2d_neon_7x7.cc | 84 ++++++++++++++-------------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/mace/kernels/arm/conv_2d_neon_5x5.cc b/mace/kernels/arm/conv_2d_neon_5x5.cc index c3af5b2a..58dbed34 100644 --- a/mace/kernels/arm/conv_2d_neon_5x5.cc +++ b/mace/kernels/arm/conv_2d_neon_5x5.cc @@ -26,55 +26,55 @@ namespace kernels { float32x4_t vf00, vf10, vf20, vf30; \ float32x2_t vf01, vf11, vf21, vf31; \ vf00 = vld1q_f32(filter_ptr0); \ - vf01 = vld1_f32(filter_ptr0 + 4); \ + vf01 = vld1_f32(filter_ptr0 + 3); \ vf10 = vld1q_f32(filter_ptr1); \ - vf11 = vld1_f32(filter_ptr1 + 4); \ + vf11 = vld1_f32(filter_ptr1 + 3); \ vf20 = vld1q_f32(filter_ptr2); \ - vf21 = vld1_f32(filter_ptr2 + 4); \ + vf21 = vld1_f32(filter_ptr2 + 3); \ vf30 = vld1q_f32(filter_ptr3); \ - vf31 = vld1_f32(filter_ptr3 + 4); \ + vf31 = vld1_f32(filter_ptr3 + 3); \ \ /* outch 0 */ \ vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \ vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \ vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ - vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 0); \ + vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); \ \ /* outch 1 */ \ vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); \ vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); \ vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); \ vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); \ - vo1 = vmlaq_lane_f32(vo1, vi4, vf11, 0); \ + vo1 = vmlaq_lane_f32(vo1, vi4, vf11, 1); \ \ /* outch 2 */ \ vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); \ vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); \ vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); \ vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); \ - vo2 = vmlaq_lane_f32(vo2, vi4, vf21, 0); \ + vo2 = vmlaq_lane_f32(vo2, vi4, vf21, 1); \ \ /* outch 3 */ \ vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); \ vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); \ vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); \ vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \ - vo3 = vmlaq_lane_f32(vo3, vi4, vf31, 0); + vo3 = vmlaq_lane_f32(vo3, vi4, vf31, 1); #define Conv2dNeonK5x5SnLoadCalc1 \ /* load filter (1 outch x 1 height x 4 width) */ \ float32x4_t vf00; \ float32x2_t vf01; \ vf00 = vld1q_f32(filter_ptr0); \ - vf01 = vld1_f32(filter_ptr0 + 4); \ + vf01 = vld1_f32(filter_ptr0 + 3); \ \ /* outch 0 */ \ vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \ vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \ vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ - vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 0); + vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); inline void Conv2dCPUK5x5Calc(const float *in_ptr_base, const float *filter_ptr0, diff --git a/mace/kernels/arm/conv_2d_neon_7x7.cc b/mace/kernels/arm/conv_2d_neon_7x7.cc index dcb9fe20..d1bbecbb 100644 --- a/mace/kernels/arm/conv_2d_neon_7x7.cc +++ b/mace/kernels/arm/conv_2d_neon_7x7.cc @@ -28,130 +28,130 @@ namespace kernels { float32x4_t vf20, vf21; \ float32x4_t vf30, vf31; \ vf00 = vld1q_f32(filter_ptr0); \ - vf01 = vld1q_f32(filter_ptr0 + 4); \ + vf01 = vld1q_f32(filter_ptr0 + 3); \ vf10 = vld1q_f32(filter_ptr1); \ - vf11 = vld1q_f32(filter_ptr1 + 4); \ + vf11 = vld1q_f32(filter_ptr1 + 3); \ vf20 = vld1q_f32(filter_ptr2); \ - vf21 = vld1q_f32(filter_ptr2 + 4); \ + vf21 = vld1q_f32(filter_ptr2 + 3); \ vf30 = vld1q_f32(filter_ptr3); \ - vf31 = vld1q_f32(filter_ptr3 + 4); \ + vf31 = vld1q_f32(filter_ptr3 + 3); \ \ /* outch 0 */ \ vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); \ vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); \ vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); \ vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); \ - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 0); \ - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 1); \ - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 2); \ + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); \ + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \ + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); \ \ /* outch 1 */ \ vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); \ vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); \ vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); \ vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); \ - vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 0); \ - vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 1); \ - vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 2); \ + vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); \ + vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); \ + vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); \ \ /* outch 2 */ \ vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); \ vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); \ vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); \ vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); \ - vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 0); \ - vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 1); \ - vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 2); \ + vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); \ + vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); \ + vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); \ \ /* outch 3 */ \ vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); \ vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); \ vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); \ vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); \ - vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 0); \ - vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 1); \ - vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 2); + vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); \ + vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); \ + vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); #define Conv2dArmv8NeonK7x7SnLoadCalc1 \ /* load filter (1 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf01; \ vf00 = vld1q_f32(filter_ptr0); \ - vf01 = vld1q_f32(filter_ptr0 + 4); \ + vf01 = vld1q_f32(filter_ptr0 + 3); \ \ /* outch 0 */ \ vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); \ vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); \ vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); \ vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); \ - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 0); \ - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 1); \ - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); \ + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \ + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); -#define Conv2dArmv7NeonK7x7SnLoadCalc4 \ +#define Conv2dArmv7NeonK7x7SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf01; \ float32x4_t vf10, vf11; \ float32x4_t vf20, vf21; \ float32x4_t vf30, vf31; \ vf00 = vld1q_f32(filter_ptr0); \ - vf01 = vld1q_f32(filter_ptr0 + 4); \ + vf01 = vld1q_f32(filter_ptr0 + 3); \ vf10 = vld1q_f32(filter_ptr1); \ - vf11 = vld1q_f32(filter_ptr1 + 4); \ + vf11 = vld1q_f32(filter_ptr1 + 3); \ vf20 = vld1q_f32(filter_ptr2); \ - vf21 = vld1q_f32(filter_ptr2 + 4); \ + vf21 = vld1q_f32(filter_ptr2 + 3); \ vf30 = vld1q_f32(filter_ptr3); \ - vf31 = vld1q_f32(filter_ptr3 + 4); \ + vf31 = vld1q_f32(filter_ptr3 + 3); \ \ /* outch 0 */ \ vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \ vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \ vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 0); \ - vo0 = vmlaq_lane_f32(vo0, vi5, vget_low_f32(vf01), 1); \ - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 0); \ + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); \ + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \ + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); \ \ /* outch 1 */ \ vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); \ vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); \ vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); \ vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); \ - vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 0); \ - vo1 = vmlaq_lane_f32(vo1, vi5, vget_low_f32(vf11), 1); \ - vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 0); \ + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); \ + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); \ + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); \ \ /* outch 2 */ \ vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); \ vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); \ vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); \ vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); \ - vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 0); \ - vo2 = vmlaq_lane_f32(vo2, vi5, vget_low_f32(vf21), 1); \ - vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 0); \ + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); \ + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); \ + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); \ \ /* outch 3 */ \ vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); \ vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); \ vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); \ vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \ - vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 0); \ - vo3 = vmlaq_lane_f32(vo3, vi5, vget_low_f32(vf31), 1); \ - vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); \ + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); \ + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); -#define Conv2dArmv7NeonK7x7SnLoadCalc1 \ +#define Conv2dArmv7NeonK7x7SnLoadCalc1 \ /* load filter (1 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf01; \ vf00 = vld1q_f32(filter_ptr0); \ - vf01 = vld1q_f32(filter_ptr0 + 4); \ + vf01 = vld1q_f32(filter_ptr0 + 3); \ \ /* outch 0 */ \ vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \ vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \ vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 0); \ - vo0 = vmlaq_lane_f32(vo0, vi5, vget_low_f32(vf01), 1); \ - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); \ + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \ + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); inline void Conv2dCPUK7x7Calc(const float *in_ptr_base, const float *filter_ptr0, -- GitLab