From 3360e9cdb8151baa33c3e82840fae2d105085a46 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Thu, 7 Sep 2017 11:06:32 +0800 Subject: [PATCH] Change the definition of vmlaq_laneq_f32 from template function to macro. --- .travis.yml | 2 +- Dockerfile.android | 4 +- paddle/function/GruFunctor.h | 1 - paddle/function/neon/NeonDepthwiseConv.cpp | 100 ++++++++++----------- paddle/function/neon/neon_util.h | 8 +- paddle/scripts/docker/build_android.sh | 6 +- 6 files changed, 59 insertions(+), 62 deletions(-) diff --git a/.travis.yml b/.travis.yml index 14a39c58de..b4b83fcdbc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ cache: - $HOME/.ccache - $HOME/.cache/pip - $TRAVIS_BUILD_DIR/build/third_party - - $TRAVIS_BUILD_DIR/build/third_party_android + - $TRAVIS_BUILD_DIR/build_android/third_party sudo: required dist: trusty os: diff --git a/Dockerfile.android b/Dockerfile.android index 6013215d9d..452aa15745 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -11,8 +11,8 @@ ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} ENV HOME=/root \ ANDROID_NDK_HOME=/opt/android-ndk-linux \ - ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain-gcc \ - ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain-gcc + ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \ + ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain RUN apt-get update && \ apt-get install -y \ diff --git a/paddle/function/GruFunctor.h b/paddle/function/GruFunctor.h index 11f6174dbd..9f6392198e 100644 --- a/paddle/function/GruFunctor.h +++ b/paddle/function/GruFunctor.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "GemmFunctor.h" -#include "GruFunctor.h" #include "hl_cpu_gru.cuh" namespace paddle { diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp index 14e5198e1b..f09e98587d 100644 --- a/paddle/function/neon/NeonDepthwiseConv.cpp +++ b/paddle/function/neon/NeonDepthwiseConv.cpp @@ -116,15 +116,15 @@ struct DepthwiseConvKernel<3, 1> { float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); - tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]); - tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]); - tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); tmp1 = vaddq_f32(tmp1, tmp2); vst1q_f32(outputData, tmp1); @@ -223,15 +223,15 @@ struct DepthwiseConvKernel<3, 2> { float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); - tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]); - tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]); - tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); tmp1 = vaddq_f32(tmp1, tmp2); vst1q_f32(outputData, tmp1); @@ -316,22 +316,22 @@ struct DepthwiseConvKernel<4, 1> { float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); - tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]); - tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); - tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]); - tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); tmp1 = vaddq_f32(tmp1, tmp2); vst1q_f32(outputData, tmp1); @@ -431,22 +431,22 @@ struct DepthwiseConvKernel<4, 2> { float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); - tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]); - tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); - tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]); - tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]); - tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]); - tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]); - tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); tmp1 = vaddq_f32(tmp1, tmp2); vst1q_f32(outputData, tmp1); diff --git a/paddle/function/neon/neon_util.h b/paddle/function/neon/neon_util.h index dbe017170b..e2db045067 100644 --- a/paddle/function/neon/neon_util.h +++ b/paddle/function/neon/neon_util.h @@ -33,12 +33,8 @@ inline float32_t vaddvq_f32(float32x4_t a) { return vget_lane_f32(vpadd_f32(v, v), 0); } -template -inline float32x4_t vmlaq_laneq_f32(float32x4_t a, - float32x4_t b, - float32x4_t v) { - return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane)); -} +#define vmlaq_laneq_f32(a, b, v, lane) \ + vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane)) #endif } // namespace neon diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index 512a37166c..aabd2da5e4 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -36,6 +36,7 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then -DUSE_EIGEN_FOR_BLAS=OFF \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ + -DWITH_STYLE_CHECK=OFF \ .. elif [ $ANDROID_ABI == "armeabi" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ @@ -48,10 +49,11 @@ elif [ $ANDROID_ABI == "armeabi" ]; then -DCMAKE_BUILD_TYPE=Release \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ + -DWITH_STYLE_CHECK=OFF \ .. else echo "Invalid ANDROID_ABI: $ANDROID_ABI" fi -make VERBOSE=1 -j2 -make install -j2 +make -j `nproc` +make install -j `nproc` -- GitLab