diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp
index f09e98587d1681d29a79a9cb0303c2d4356c6935..14e5198e1bcdfb3101bc4d1c39b542757fbd7603 100644
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
@@ -116,15 +116,15 @@ struct DepthwiseConvKernel<3, 1> {
 
         float32x4_t tmp1 = vdupq_n_f32(0.f);
         float32x4_t tmp2 = vdupq_n_f32(0.f);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
+        tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]);
+        tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]);
+        tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
         tmp1 = vaddq_f32(tmp1, tmp2);
         vst1q_f32(outputData, tmp1);
 
@@ -223,15 +223,15 @@ struct DepthwiseConvKernel<3, 2> {
 
         float32x4_t tmp1 = vdupq_n_f32(0.f);
         float32x4_t tmp2 = vdupq_n_f32(0.f);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
+        tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]);
+        tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]);
+        tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
         tmp1 = vaddq_f32(tmp1, tmp2);
         vst1q_f32(outputData, tmp1);
 
@@ -316,22 +316,22 @@ struct DepthwiseConvKernel<4, 1> {
 
         float32x4_t tmp1 = vdupq_n_f32(0.f);
         float32x4_t tmp2 = vdupq_n_f32(0.f);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
+        tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]);
+        tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
+        tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]);
+        tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]);
         tmp1 = vaddq_f32(tmp1, tmp2);
         vst1q_f32(outputData, tmp1);
 
@@ -431,22 +431,22 @@ struct DepthwiseConvKernel<4, 2> {
 
        float32x4_t tmp1 = vdupq_n_f32(0.f);
        float32x4_t tmp2 = vdupq_n_f32(0.f);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
-        tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
-        tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
+        tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]);
+        tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
+        tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]);
+        tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]);
+        tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]);
+        tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]);
+        tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]);
         tmp1 = vaddq_f32(tmp1, tmp2);
         vst1q_f32(outputData, tmp1);
 
diff --git a/paddle/function/neon/neon_util.h b/paddle/function/neon/neon_util.h
index 56b3febe2d27bb4fbf57e49079b3ad071d556914..dbe017170bbf8a69bf842cd8b4638aae5e0b5b38 100644
--- a/paddle/function/neon/neon_util.h
+++ b/paddle/function/neon/neon_util.h
@@ -33,10 +33,10 @@ inline float32_t vaddvq_f32(float32x4_t a) {
   return vget_lane_f32(vpadd_f32(v, v), 0);
 }
 
+template <int lane>
 inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
                                    float32x4_t b,
-                                   float32x4_t v,
-                                   const int lane) {
+                                   float32x4_t v) {
   return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
 }
 #endif
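
Note (not part of the patch): the neon_util.h helper is a fallback for targets whose <arm_neon.h> lacks a native vmlaq_laneq_f32, and it is implemented with vgetq_lane_f32, whose lane argument must be a compile-time constant. A function parameter, even a const int, is not a constant expression in C++, whereas a non-type template parameter is; moving the lane into the template argument list appears to be the point of the change, and the call sites above switch from a trailing lane argument to vmlaq_laneq_f32<N>(...). The sketch below is a minimal, self-contained illustration of that idea under the assumption of an ARMv7 NEON build where the intrinsic is absent; the real header keeps its definition behind a preprocessor guard (the hunk's trailing #endif), which is omitted here, and accumulate_lane2 is an illustrative name, not from the patch.

// Minimal sketch, assuming an ARMv7 NEON target where <arm_neon.h> does not
// already define vmlaq_laneq_f32.
#include <arm_neon.h>

// Lane index as a non-type template parameter: inside the body, `lane` is a
// constant expression, which satisfies vgetq_lane_f32's immediate requirement.
template <int lane>
inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
                                   float32x4_t b,
                                   float32x4_t v) {
  return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));  // a + b * v[lane]
}

// Usage mirrors the kernels above: the lane moves from the last runtime
// argument into the template argument list.
float32x4_t accumulate_lane2(float32x4_t acc, float32x4_t in, float32x4_t k) {
  return vmlaq_laneq_f32<2>(acc, in, k);  // acc += in * k[2]
}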