提交 3360e9cd 编写于 作者: Liu Yiqun

Change the definition of vmlaq_laneq_f32 from a template function to a macro.

上级 a98c9e6b
...@@ -4,7 +4,7 @@ cache: ...@@ -4,7 +4,7 @@ cache:
- $HOME/.ccache - $HOME/.ccache
- $HOME/.cache/pip - $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party - $TRAVIS_BUILD_DIR/build/third_party
- $TRAVIS_BUILD_DIR/build/third_party_android - $TRAVIS_BUILD_DIR/build_android/third_party
sudo: required sudo: required
dist: trusty dist: trusty
os: os:
......
...@@ -11,8 +11,8 @@ ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} ...@@ -11,8 +11,8 @@ ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
ENV HOME=/root \ ENV HOME=/root \
ANDROID_NDK_HOME=/opt/android-ndk-linux \ ANDROID_NDK_HOME=/opt/android-ndk-linux \
ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain-gcc \ ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \
ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain-gcc ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y \ apt-get install -y \
......
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once #pragma once
#include "GemmFunctor.h" #include "GemmFunctor.h"
#include "GruFunctor.h"
#include "hl_cpu_gru.cuh" #include "hl_cpu_gru.cuh"
namespace paddle { namespace paddle {
......
...@@ -116,15 +116,15 @@ struct DepthwiseConvKernel<3, 1> { ...@@ -116,15 +116,15 @@ struct DepthwiseConvKernel<3, 1> {
float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]); tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]); tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]); tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]); tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp1 = vaddq_f32(tmp1, tmp2); tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1); vst1q_f32(outputData, tmp1);
...@@ -223,15 +223,15 @@ struct DepthwiseConvKernel<3, 2> { ...@@ -223,15 +223,15 @@ struct DepthwiseConvKernel<3, 2> {
float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]); tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]); tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]); tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]); tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp1 = vaddq_f32(tmp1, tmp2); tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1); vst1q_f32(outputData, tmp1);
...@@ -316,22 +316,22 @@ struct DepthwiseConvKernel<4, 1> { ...@@ -316,22 +316,22 @@ struct DepthwiseConvKernel<4, 1> {
float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]); tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]); tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]); tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]); tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]); tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]); tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]); tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]); tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]); tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]); tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]); tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
tmp1 = vaddq_f32(tmp1, tmp2); tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1); vst1q_f32(outputData, tmp1);
...@@ -431,22 +431,22 @@ struct DepthwiseConvKernel<4, 2> { ...@@ -431,22 +431,22 @@ struct DepthwiseConvKernel<4, 2> {
float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]); tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]); tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]); tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]); tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]); tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]); tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]); tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]); tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]); tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]); tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]); tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
tmp1 = vaddq_f32(tmp1, tmp2); tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1); vst1q_f32(outputData, tmp1);
......
...@@ -33,12 +33,8 @@ inline float32_t vaddvq_f32(float32x4_t a) { ...@@ -33,12 +33,8 @@ inline float32_t vaddvq_f32(float32x4_t a) {
return vget_lane_f32(vpadd_f32(v, v), 0); return vget_lane_f32(vpadd_f32(v, v), 0);
} }
template <int lane> #define vmlaq_laneq_f32(a, b, v, lane) \
inline float32x4_t vmlaq_laneq_f32(float32x4_t a, vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))
float32x4_t b,
float32x4_t v) {
return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
}
#endif #endif
} // namespace neon } // namespace neon
......
...@@ -36,6 +36,7 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then ...@@ -36,6 +36,7 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then
-DUSE_EIGEN_FOR_BLAS=OFF \ -DUSE_EIGEN_FOR_BLAS=OFF \
-DWITH_C_API=ON \ -DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \ -DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
.. ..
elif [ $ANDROID_ABI == "armeabi" ]; then elif [ $ANDROID_ABI == "armeabi" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \ cmake -DCMAKE_SYSTEM_NAME=Android \
...@@ -48,10 +49,11 @@ elif [ $ANDROID_ABI == "armeabi" ]; then ...@@ -48,10 +49,11 @@ elif [ $ANDROID_ABI == "armeabi" ]; then
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=Release \
-DWITH_C_API=ON \ -DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \ -DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
.. ..
else else
echo "Invalid ANDROID_ABI: $ANDROID_ABI" echo "Invalid ANDROID_ABI: $ANDROID_ABI"
fi fi
make VERBOSE=1 -j2 make -j `nproc`
make install -j2 make install -j `nproc`
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册