From f241773c4f1803631bba968bca1d5621a0d3ced5 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 23 Aug 2017 19:43:57 +0800 Subject: [PATCH] Support to use clang for Android cross-compiling. --- Dockerfile.android | 4 +- cmake/cblas.cmake | 4 + cmake/external/warpctc.cmake | 1 + paddle/cuda/include/hl_cpu_gru.cuh | 166 ++++++++++++------------- paddle/function/MulOp.cpp | 37 +++--- paddle/math/MathFunctions.cpp | 4 + paddle/math/MathFunctions.h | 23 +++- paddle/math/Matrix.cpp | 18 ++- paddle/scripts/docker/build_android.sh | 51 ++++++-- 9 files changed, 181 insertions(+), 127 deletions(-) diff --git a/Dockerfile.android b/Dockerfile.android index aa95abb36..6013215d9 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -47,8 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \ wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ unzip -q android-ndk-r14b-linux-x86_64.zip && \ mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ - ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \ - ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-21 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \ + ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \ + ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \ rm -rf /opt/android-ndk-tmp && \ rm -rf ${ANDROID_NDK_HOME} diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 854066fd1..ab111eccc 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -13,6 +13,10 @@ # system paths. # +if(USE_EIGEN_FOR_BLAS) + return() +endif(USE_EIGEN_FOR_BLAS) + set(CBLAS_FOUND OFF) ## Find MKLML First. diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 2d7daed9b..3cc652bed 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -41,6 +41,7 @@ IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "App ELSE() SET(USE_OMP ON) ENDIF() +SET(USE_OMP OFF FORCE) ExternalProject_Add( extern_warpctc diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh index c0a37ced2..732799a28 100644 --- a/paddle/cuda/include/hl_cpu_gru.cuh +++ b/paddle/cuda/include/hl_cpu_gru.cuh @@ -20,11 +20,11 @@ limitations under the License. */ #include "paddle/math/MathFunctions.h" -#ifndef PADDLE_TYPE_DOUBLE -#define CBLAS_GEMM paddle::gemm -#else -#define CBLAS_GEMM paddle::gemm -#endif +// #ifndef PADDLE_TYPE_DOUBLE +// #define CBLAS_GEMM paddle::gemm +// #else +// #define CBLAS_GEMM paddle::gemm +// #endif template void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, @@ -219,37 +219,37 @@ void hl_cpu_gru_forward(OpResetOutput opResetOutput, hl_activation_mode_t active_node, hl_activation_mode_t active_gate) { if (value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasNoTrans, - batchSize, - 2 * frameSize, - frameSize, - 1, - value.prevOutValue, - frameSize, - value.gateWeight, - frameSize * 2, - 1, - value.gateValue, - frameSize * 3); +// CBLAS_GEMM(CblasNoTrans, +// CblasNoTrans, +// batchSize, +// 2 * frameSize, +// frameSize, +// 1, +// value.prevOutValue, +// frameSize, +// value.gateWeight, +// frameSize * 2, +// 1, +// value.gateValue, +// frameSize * 3); } forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate); if (value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasNoTrans, - batchSize, - frameSize, - frameSize, - 1, - value.resetOutputValue, - frameSize, - value.stateWeight, - frameSize, - 1, - value.gateValue + frameSize * 2, - frameSize * 3); +// CBLAS_GEMM(CblasNoTrans, +// CblasNoTrans, +// batchSize, +// frameSize, +// frameSize, +// 1, +// value.resetOutputValue, +// frameSize, +// value.stateWeight, +// frameSize, +// 1, +// value.gateValue + frameSize * 2, +// frameSize * 3); } forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node); @@ -538,34 +538,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad, frameSize, batchSize, active_node); if (value.prevOutValue && grad.prevOutGrad) { - CBLAS_GEMM(CblasNoTrans, - CblasTrans, - batchSize, - frameSize, - frameSize, - 1, - grad.gateGrad + frameSize * 2, - frameSize * 3, - value.stateWeight, - frameSize, - 0, - grad.resetOutputGrad, - frameSize); +// CBLAS_GEMM(CblasNoTrans, +// CblasTrans, +// batchSize, +// frameSize, +// frameSize, +// 1, +// grad.gateGrad + frameSize * 2, +// frameSize * 3, +// value.stateWeight, +// frameSize, +// 0, +// grad.resetOutputGrad, +// frameSize); if (grad.stateWeightGrad) { - CBLAS_GEMM(CblasTrans, - CblasNoTrans, - frameSize, - frameSize, - batchSize, - 1, - value.resetOutputValue, - frameSize, - grad.gateGrad + frameSize * 2, - frameSize * 3, - 1, - grad.stateWeightGrad, - frameSize); +// CBLAS_GEMM(CblasTrans, +// CblasNoTrans, +// frameSize, +// frameSize, +// batchSize, +// 1, +// value.resetOutputValue, +// frameSize, +// grad.gateGrad + frameSize * 2, +// frameSize * 3, +// 1, +// grad.stateWeightGrad, +// frameSize); } } @@ -573,34 +573,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad, frameSize, batchSize, active_gate); if (grad.prevOutGrad && value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasTrans, - batchSize, - frameSize, - frameSize * 2, - 1, - grad.gateGrad, - frameSize * 3, - value.gateWeight, - frameSize * 2, - 1, - grad.prevOutGrad, - frameSize); +// CBLAS_GEMM(CblasNoTrans, +// CblasTrans, +// batchSize, +// frameSize, +// frameSize * 2, +// 1, +// grad.gateGrad, +// frameSize * 3, +// value.gateWeight, +// frameSize * 2, +// 1, +// grad.prevOutGrad, +// frameSize); if (grad.gateWeightGrad) { - CBLAS_GEMM(CblasTrans, - CblasNoTrans, - frameSize, - frameSize * 2, - batchSize, - 1, - value.prevOutValue, - frameSize, - grad.gateGrad, - frameSize * 3, - 1, - grad.gateWeightGrad, - frameSize * 2); +// CBLAS_GEMM(CblasTrans, +// CblasNoTrans, +// frameSize, +// frameSize * 2, +// batchSize, +// 1, +// value.prevOutValue, +// frameSize, +// grad.gateGrad, +// frameSize * 3, +// 1, +// grad.gateWeightGrad, +// frameSize * 2); } } } diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp index 91b4b8ed9..25e41edad 100644 --- a/paddle/function/MulOp.cpp +++ b/paddle/function/MulOp.cpp @@ -13,18 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "MulOp.h" -/// todo(tianbing), delete it -#include -#include "paddle/math/MathFunctions.h" +#include "GemmFunctor.h" #include "paddle/math/SIMDFunctions.h" #include "paddle/utils/ThreadLocal.h" -#ifndef PADDLE_TYPE_DOUBLE -#define GEMM paddle::gemm -#else -#define GEMM paddle::gemm -#endif - namespace { inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { for (unsigned int i = 0; i < len; ++i) { @@ -114,19 +106,20 @@ void MulOp(CpuMatrix& out, real scaleT, bool aTrans, bool bTrans) { - GEMM(aTrans ? CblasTrans : CblasNoTrans, - bTrans ? CblasTrans : CblasNoTrans, - out.getHeight(), - out.getWidth(), - !aTrans ? a.getWidth() : a.getHeight(), - scaleAB, - a.getData(), - a.getStride(), - b.getData(), - b.getStride(), - scaleT, - out.getData(), - out.getStride()); + BlasGemm::compute( + aTrans, + bTrans, + out.getHeight(), + out.getWidth(), + !aTrans ? a.getWidth() : a.getHeight(), + scaleAB, + a.getData(), + a.getStride(), + b.getData(), + b.getStride(), + scaleT, + out.getData(), + out.getStride()); } /// dense matrix (+)= sparse matrix * dense matrix diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index c8ba1074a..c2f17beeb 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP) namespace paddle { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template <> void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, @@ -143,6 +144,7 @@ void gemm(const CBLAS_TRANSPOSE transA, C, ldc); } +#endif template <> int getrf(const CBLAS_ORDER order, @@ -182,6 +184,7 @@ int getri(const CBLAS_ORDER order, return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv); } +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template <> void axpy(const int n, const float alpha, const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); @@ -201,6 +204,7 @@ template <> double dotProduct(const int n, const double* x, const double* y) { return cblas_ddot(n, x, 1, y, 1); } +#endif #if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML) diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 637643838..9297ae78c 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -40,7 +40,14 @@ extern "C" { #ifndef LAPACK_FOUND extern "C" { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS #include +#else +typedef enum CBLAS_ORDER { + CblasRowMajor = 101, + CblasColMajor = 102 +} CBLAS_ORDER; +#endif int LAPACKE_sgetrf( int matrix_layout, int m, int n, float* a, int lda, int* ipiv); int LAPACKE_dgetrf( @@ -56,6 +63,7 @@ int LAPACKE_dgetri( namespace paddle { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, @@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA, const T beta, T* C, const int ldc); +#endif template int getrf(const CBLAS_ORDER Order, @@ -84,10 +93,20 @@ int getri( const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv); template -void axpy(const int n, const T alpha, const T* x, T* y); +void axpy(const int n, const T alpha, const T* x, T* y) { + /// y = y + alpha * x + for (int i = 0; i < n; i++) { + y[i] = y[i] + alpha * x[i]; + } +} template -T dotProduct(const int n, const T* x, const T* y); +T dotProduct(const int n, const T* x, const T* y) { + T result = static_cast(0); + for (int i = 0; i < n; i++) { + result += x[i] * y[i]; + } +} template void vExp(const int n, const T* a, T* r); diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 27f7d95b7..fbf3accc9 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -28,6 +28,7 @@ limitations under the License. */ #include "hl_top_k.h" #include "paddle/utils/Logging.h" +#include "paddle/function/GemmFunctor.h" #include "paddle/utils/ThreadLocal.h" #include "SIMDFunctions.h" @@ -2222,24 +2223,29 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { CHECK(!isTransposed()) << "Not supported"; size_t a_col, b_col, a_row, b_row; - CBLAS_TRANSPOSE a_trans, b_trans; + // CBLAS_TRANSPOSE a_trans, b_trans; + bool a_trans, b_trans; if (!a->isTransposed()) { a_col = a->getWidth(); a_row = a->getHeight(); - a_trans = CblasNoTrans; + // a_trans = CblasNoTrans; + a_trans = false; } else { a_col = a->getHeight(); a_row = a->getWidth(); - a_trans = CblasTrans; + // a_trans = CblasTrans; + a_trans = true; } if (!b->isTransposed()) { b_col = b->getWidth(); b_row = b->getHeight(); - b_trans = CblasNoTrans; + // b_trans = CblasNoTrans; + b_trans = false; } else { b_col = b->getHeight(); b_row = b->getWidth(); - b_trans = CblasTrans; + // b_trans = CblasTrans; + b_trans = true; } CHECK_EQ(a_col, b_row); @@ -2256,7 +2262,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { int lda = a->getStride(); int ldb = b->getStride(); int ldc = getStride(); - gemm( + BlasGemm::compute( a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); } diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index 593ae28e4..a61c7c40e 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -2,11 +2,31 @@ set -xe -mkdir -p /paddle/build_android/$ANDROID_ABI -cd /paddle/build_android/$ANDROID_ABI -rm -rf /paddle/install 2>/dev/null || true +COMPILER=gcc +USE_EIGEN=ON +if [ $COMPILER == clang ]; then + SUFFIX=_clang + C_COMPILER=clang + CXX_COMPILER=clang++ +else + SUFFIX=_gcc + C_COMPILER=gcc + CXX_COMPILER=g++ +fi +if [ $USE_EIGEN == ON ]; then + SUFFIX=${SUFFIX}_eigen +else + SUFFIX=${SUFFIX}_openblas +fi -THIRD_PARTY_PATH=/paddle/third_party_android/$ANDROID_ABI +BUILD_ROOT=/paddle/build_android$SUFFIX +DEST_ROOT=/paddle/install$SUFFIX + +rm -rf $BUILD_ROOT 2>/dev/null || true +mkdir -p $BUILD_ROOT +cd $BUILD_ROOT + +THIRD_PARTY_PATH=/paddle/third_party_android$SUFFIX/$ANDROID_ABI if [ $ANDROID_ABI == "armeabi-v7a" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ @@ -14,27 +34,34 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then -DANDROID_ABI=$ANDROID_ABI \ -DANDROID_ARM_NEON=ON \ -DANDROID_ARM_MODE=ON \ + -DCMAKE_C_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-${C_COMPILER} \ + -DCMAKE_CXX_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-${CXX_COMPILER} \ -DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_CXX_COMPILER=/usr/bin/g++ \ - -DCMAKE_INSTALL_PREFIX=/paddle/install \ + -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ -DCMAKE_BUILD_TYPE=Release \ + -DUSE_EIGEN_FOR_BLAS=${USE_EIGEN} \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - /paddle -elif [ $ANDROID_ABI == "arm64-v7a" ]; then + -DWITH_STYLE_CHECK=OFF \ + .. +elif [ $ANDROID_ABI == "arm64-v8a" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=$ANDROID_ABI \ -DANDROID_ARM_MODE=ON \ + -DCMAKE_C_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-${C_COMPILER} \ + -DCMAKE_CXX_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-${CXX_COMPILER} \ -DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_CXX_COMPILER=/usr/bin/g++ \ - -DCMAKE_INSTALL_PREFIX=/paddle/install \ + -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ -DCMAKE_BUILD_TYPE=Release \ + -DUSE_EIGEN_FOR_BLAS=${USE_EIGEN} \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - /paddle + .. elif [ $ANDROID_ABI == "armeabi" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \ @@ -47,10 +74,10 @@ elif [ $ANDROID_ABI == "armeabi" ]; then -DCMAKE_BUILD_TYPE=Release \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - /paddle + .. else echo "Invalid ANDROID_ABI: $ANDROID_ABI" fi -make -j `nproc` -make install -j `nproc` +make VERBOSE=1 -j2 +make install -j2 -- GitLab