提交 c54c7d91 编写于 作者: L Liu Yiqun

Use template to deliver const argument instead, to remove the compiling error...

Use template to deliver const argument instead, to remove the compiling error "argument to __builtin_neon_vgetq_lane_f32 must be a constant integer".
上级 50c63dc9
...@@ -116,15 +116,15 @@ struct DepthwiseConvKernel<3, 1> { ...@@ -116,15 +116,15 @@ struct DepthwiseConvKernel<3, 1> {
float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
tmp1 = vaddq_f32(tmp1, tmp2); tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1); vst1q_f32(outputData, tmp1);
...@@ -223,15 +223,15 @@ struct DepthwiseConvKernel<3, 2> { ...@@ -223,15 +223,15 @@ struct DepthwiseConvKernel<3, 2> {
float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
tmp1 = vaddq_f32(tmp1, tmp2); tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1); vst1q_f32(outputData, tmp1);
...@@ -316,22 +316,22 @@ struct DepthwiseConvKernel<4, 1> { ...@@ -316,22 +316,22 @@ struct DepthwiseConvKernel<4, 1> {
float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]);
tmp1 = vaddq_f32(tmp1, tmp2); tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1); vst1q_f32(outputData, tmp1);
...@@ -431,22 +431,22 @@ struct DepthwiseConvKernel<4, 2> { ...@@ -431,22 +431,22 @@ struct DepthwiseConvKernel<4, 2> {
float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]);
tmp1 = vaddq_f32(tmp1, tmp2); tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1); vst1q_f32(outputData, tmp1);
......
...@@ -33,10 +33,10 @@ inline float32_t vaddvq_f32(float32x4_t a) { ...@@ -33,10 +33,10 @@ inline float32_t vaddvq_f32(float32x4_t a) {
return vget_lane_f32(vpadd_f32(v, v), 0); return vget_lane_f32(vpadd_f32(v, v), 0);
} }
template <int lane>
inline float32x4_t vmlaq_laneq_f32(float32x4_t a, inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
float32x4_t b, float32x4_t b,
float32x4_t v, float32x4_t v) {
const int lane) {
return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane)); return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
} }
#endif #endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册