提交 9f1d1bf4 编写于 作者: Y Yiqun Liu 提交者: GitHub

Merge pull request #4436 from Xreki/fix_neon_memory_out_of_range

Use scalar implementation instead of neon implementation to avoid out of range memory access in the tail conv3x3
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include "neon_util.h" #include "neon_util.h"
namespace paddle { namespace paddle {
namespace neon { namespace neon {
#if defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
...@@ -26,17 +25,20 @@ namespace neon { ...@@ -26,17 +25,20 @@ namespace neon {
template <int filterSize, int stride> template <int filterSize, int stride>
struct DepthwiseConvKernel {}; struct DepthwiseConvKernel {};
inline float32_t conv3x3(float32x4_t r0, inline float32_t conv3x3(const float* r0,
float32x4_t r1, const float* r1,
float32x4_t r2, const float* r2,
float32x4_t k0, float32x4_t k0,
float32x4_t k1, float32x4_t k1,
float32x4_t k2) { float32x4_t k2) {
float32x4_t tmp; float32_t tmp[12];
tmp = vmulq_f32(r0, k0); vst1q_f32(&(tmp[0]), k0);
tmp = vmlaq_f32(tmp, r1, k1); vst1q_f32(&(tmp[4]), k1);
tmp = vmlaq_f32(tmp, r2, k2); vst1q_f32(&(tmp[8]), k2);
return vaddvq_f32(tmp); float32_t sum0 = r0[0] * tmp[0] + r0[1] * tmp[1] + r0[2] * tmp[2];
float32_t sum1 = r1[0] * tmp[4] + r1[1] * tmp[5] + r1[2] * tmp[6];
float32_t sum2 = r2[0] * tmp[8] + r2[1] * tmp[9] + r2[2] * tmp[10];
return sum0 + sum1 + sum2;
} }
inline float32_t conv4x4(float32x4_t r0, inline float32_t conv4x4(float32x4_t r0,
...@@ -136,10 +138,7 @@ struct DepthwiseConvKernel<3, 1> { ...@@ -136,10 +138,7 @@ struct DepthwiseConvKernel<3, 1> {
} }
for (int r = 0; r < remain; r++) { for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0); *outputData = conv3x3(r0, r1, r2, k[0], k[1], k[2]);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
*outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
r0++; r0++;
r1++; r1++;
r2++; r2++;
...@@ -243,10 +242,7 @@ struct DepthwiseConvKernel<3, 2> { ...@@ -243,10 +242,7 @@ struct DepthwiseConvKernel<3, 2> {
} }
for (int r = 0; r < remain; r++) { for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0); *outputData = conv3x3(r0, r1, r2, k[0], k[1], k[2]);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
*outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
r0 += 2; r0 += 2;
r1 += 2; r1 += 2;
r2 += 2; r2 += 2;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册