From 8a4fad4248e942061586538e8de14a7d08052330 Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Wed, 23 Aug 2017 19:43:57 +0800
Subject: [PATCH] Support to use clang for Android cross-compiling.

---
 cmake/cblas.cmake                      |   4 +
 cmake/external/warpctc.cmake           |   1 +
 paddle/cuda/include/hl_cpu_gru.cuh     | 166 ++++++++++++-------
 paddle/function/MulOp.cpp              |  37 +++---
 paddle/math/MathFunctions.cpp          |   4 +
 paddle/math/MathFunctions.h            |  23 +++-
 paddle/math/Matrix.cpp                 |  18 ++-
 paddle/scripts/docker/build_android.sh |  24 ++--
 8 files changed, 155 insertions(+), 122 deletions(-)

diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 854066fd1d..ab111eccc0 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -13,6 +13,10 @@
 # system paths.
 #
 
+if(USE_EIGEN_FOR_BLAS)
+  return()
+endif(USE_EIGEN_FOR_BLAS)
+
 set(CBLAS_FOUND OFF)
 
 ## Find MKLML First.
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 2d7daed9bc..3cc652bed5 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -41,6 +41,7 @@ IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
 ELSE()
     SET(USE_OMP ON)
 ENDIF()
+SET(USE_OMP OFF)
 
 ExternalProject_Add(
     extern_warpctc
diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh
index c0a37ced2a..732799a28b 100644
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@@ -20,11 +20,11 @@ limitations under the License. */
 
 #include "paddle/math/MathFunctions.h"
 
-#ifndef PADDLE_TYPE_DOUBLE
-#define CBLAS_GEMM paddle::gemm<float>
-#else
-#define CBLAS_GEMM paddle::gemm<double>
-#endif
+// #ifndef PADDLE_TYPE_DOUBLE
+// #define CBLAS_GEMM paddle::gemm<float>
+// #else
+// #define CBLAS_GEMM paddle::gemm<double>
+// #endif
 
 template <class OpResetOutput>
 void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
@@ -219,37 +219,37 @@ void hl_cpu_gru_forward(OpResetOutput opResetOutput,
                         hl_activation_mode_t active_node,
                         hl_activation_mode_t active_gate) {
   if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               2 * frameSize,
-               frameSize,
-               1,
-               value.prevOutValue,
-               frameSize,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               value.gateValue,
-               frameSize * 3);
+// CBLAS_GEMM(CblasNoTrans,
+//            CblasNoTrans,
+//            batchSize,
+//            2 * frameSize,
+//            frameSize,
+//            1,
+//            value.prevOutValue,
+//            frameSize,
+//            value.gateWeight,
+//            frameSize * 2,
+//            1,
+//            value.gateValue,
+//            frameSize * 3);
   }
 
   forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
 
   if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               value.resetOutputValue,
-               frameSize,
-               value.stateWeight,
-               frameSize,
-               1,
-               value.gateValue + frameSize * 2,
-               frameSize * 3);
+// CBLAS_GEMM(CblasNoTrans,
+//            CblasNoTrans,
+//            batchSize,
+//            frameSize,
+//            frameSize,
+//            1,
+//            value.resetOutputValue,
+//            frameSize,
+//            value.stateWeight,
+//            frameSize,
+//            1,
+//            value.gateValue + frameSize * 2,
+//            frameSize * 3);
   }
 
   forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
@@ -538,34 +538,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad,
                      frameSize, batchSize, active_node);
 
   if (value.prevOutValue && grad.prevOutGrad) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               grad.gateGrad + frameSize * 2,
-               frameSize * 3,
-               value.stateWeight,
-               frameSize,
-               0,
-               grad.resetOutputGrad,
-               frameSize);
+// CBLAS_GEMM(CblasNoTrans,
+//            CblasTrans,
+//            batchSize,
+//            frameSize,
+//            frameSize,
+//            1,
+//            grad.gateGrad + frameSize * 2,
+//            frameSize * 3,
+//            value.stateWeight,
+//            frameSize,
+//            0,
+//            grad.resetOutputGrad,
+//            frameSize);
 
     if (grad.stateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize,
-                 batchSize,
-                 1,
-                 value.resetOutputValue,
-                 frameSize,
-                 grad.gateGrad + frameSize * 2,
-                 frameSize * 3,
-                 1,
-                 grad.stateWeightGrad,
-                 frameSize);
+//    CBLAS_GEMM(CblasTrans,
+//               CblasNoTrans,
+//               frameSize,
+//               frameSize,
+//               batchSize,
+//               1,
+//               value.resetOutputValue,
+//               frameSize,
+//               grad.gateGrad + frameSize * 2,
+//               frameSize * 3,
+//               1,
+//               grad.stateWeightGrad,
+//               frameSize);
     }
   }
 
@@ -573,34 +573,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad,
                      frameSize, batchSize, active_gate);
 
   if (grad.prevOutGrad && value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize * 2,
-               1,
-               grad.gateGrad,
-               frameSize * 3,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               grad.prevOutGrad,
-               frameSize);
+// CBLAS_GEMM(CblasNoTrans,
+//            CblasTrans,
+//            batchSize,
+//            frameSize,
+//            frameSize * 2,
+//            1,
+//            grad.gateGrad,
+//            frameSize * 3,
+//            value.gateWeight,
+//            frameSize * 2,
+//            1,
+//            grad.prevOutGrad,
+//            frameSize);
 
     if (grad.gateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize * 2,
-                 batchSize,
-                 1,
-                 value.prevOutValue,
-                 frameSize,
-                 grad.gateGrad,
-                 frameSize * 3,
-                 1,
-                 grad.gateWeightGrad,
-                 frameSize * 2);
+//    CBLAS_GEMM(CblasTrans,
+//               CblasNoTrans,
+//               frameSize,
+//               frameSize * 2,
+//               batchSize,
+//               1,
+//               value.prevOutValue,
+//               frameSize,
+//               grad.gateGrad,
+//               frameSize * 3,
+//               1,
+//               grad.gateWeightGrad,
+//               frameSize * 2);
     }
   }
 }
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 91b4b8ed91..25e41edad5 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -13,18 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "MulOp.h"
-/// todo(tianbing), delete it
-#include <iostream>
-#include "paddle/math/MathFunctions.h"
+#include "GemmFunctor.h"
 #include "paddle/math/SIMDFunctions.h"
 #include "paddle/utils/ThreadLocal.h"
 
-#ifndef PADDLE_TYPE_DOUBLE
-#define GEMM paddle::gemm<float>
-#else
-#define GEMM paddle::gemm<double>
-#endif
-
 namespace {
 inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
   for (unsigned int i = 0; i < len; ++i) {
@@ -114,19 +106,20 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
                             real scaleT,
                             bool aTrans,
                             bool bTrans) {
-  GEMM(aTrans ? CblasTrans : CblasNoTrans,
-       bTrans ? CblasTrans : CblasNoTrans,
-       out.getHeight(),
-       out.getWidth(),
-       !aTrans ? a.getWidth() : a.getHeight(),
-       scaleAB,
-       a.getData(),
-       a.getStride(),
-       b.getData(),
-       b.getStride(),
-       scaleT,
-       out.getData(),
-       out.getStride());
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
+      aTrans,
+      bTrans,
+      out.getHeight(),
+      out.getWidth(),
+      !aTrans ? a.getWidth() : a.getHeight(),
+      scaleAB,
+      a.getData(),
+      a.getStride(),
+      b.getData(),
+      b.getStride(),
+      scaleT,
+      out.getData(),
+      out.getStride());
 }
 
 /// dense matrix (+)= sparse matrix * dense matrix
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index c8ba1074a1..c2f17beeb8 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
 
 namespace paddle {
 
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void gemm<float>(const CBLAS_TRANSPOSE transA,
                  const CBLAS_TRANSPOSE transB,
@@ -143,6 +144,7 @@ void gemm<double>(const CBLAS_TRANSPOSE transA,
              C,
              ldc);
 }
+#endif
 
 template <>
 int getrf<float>(const CBLAS_ORDER order,
@@ -182,6 +184,7 @@ int getri<double>(const CBLAS_ORDER order,
   return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
 }
 
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void axpy<float>(const int n, const float alpha, const float* x, float* y) {
   cblas_saxpy(n, alpha, x, 1, y, 1);
@@ -201,6 +204,7 @@ template <>
 double dotProduct<double>(const int n, const double* x, const double* y) {
   return cblas_ddot(n, x, 1, y, 1);
 }
+#endif
 
 #if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index 637643838f..9297ae78c2 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -40,7 +40,14 @@ extern "C" {
 
 #ifndef LAPACK_FOUND
 extern "C" {
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 #include <cblas.h>
+#else
+typedef enum CBLAS_ORDER {
+  CblasRowMajor = 101,
+  CblasColMajor = 102
+} CBLAS_ORDER;
+#endif
 int LAPACKE_sgetrf(
     int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
 int LAPACKE_dgetrf(
@@ -56,6 +63,7 @@ int LAPACKE_dgetri(
 
 namespace paddle {
 
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <class T>
 void gemm(const CBLAS_TRANSPOSE transA,
           const CBLAS_TRANSPOSE transB,
@@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA,
           const T beta,
           T* C,
           const int ldc);
+#endif
 
 template <class T>
 int getrf(const CBLAS_ORDER Order,
@@ -84,10 +93,20 @@ int getri(
     const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);
 
 template <class T>
-void axpy(const int n, const T alpha, const T* x, T* y);
+void axpy(const int n, const T alpha, const T* x, T* y) {
+  /// y = y + alpha * x
+  for (int i = 0; i < n; i++) {
+    y[i] = y[i] + alpha * x[i];
+  }
+}
 
 template <class T>
-T dotProduct(const int n, const T* x, const T* y);
+T dotProduct(const int n, const T* x, const T* y) {
+  T result = static_cast<T>(0);
+  for (int i = 0; i < n; i++) {
+    result += x[i] * y[i];
+  }
+  return result;
+}
 
 template <class T>
 void vExp(const int n, const T* a, T* r);
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 27f7d95b75..fbf3accc9a 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "hl_top_k.h"
 #include "paddle/utils/Logging.h"
 
+#include "paddle/function/GemmFunctor.h"
 #include "paddle/utils/ThreadLocal.h"
 
 #include "SIMDFunctions.h"
@@ -2222,24 +2223,29 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
   CHECK(!isTransposed()) << "Not supported";
 
   size_t a_col, b_col, a_row, b_row;
-  CBLAS_TRANSPOSE a_trans, b_trans;
+  // CBLAS_TRANSPOSE a_trans, b_trans;
+  bool a_trans, b_trans;
   if (!a->isTransposed()) {
     a_col = a->getWidth();
     a_row = a->getHeight();
-    a_trans = CblasNoTrans;
+    // a_trans = CblasNoTrans;
+    a_trans = false;
   } else {
     a_col = a->getHeight();
     a_row = a->getWidth();
-    a_trans = CblasTrans;
+    // a_trans = CblasTrans;
+    a_trans = true;
   }
 
   if (!b->isTransposed()) {
     b_col = b->getWidth();
     b_row = b->getHeight();
-    b_trans = CblasNoTrans;
+    // b_trans = CblasNoTrans;
+    b_trans = false;
   } else {
     b_col = b->getHeight();
     b_row = b->getWidth();
-    b_trans = CblasTrans;
+    // b_trans = CblasTrans;
+    b_trans = true;
   }
 
   CHECK_EQ(a_col, b_row);
@@ -2256,7 +2262,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
   int lda = a->getStride();
   int ldb = b->getStride();
   int ldc = getStride();
-  gemm<real>(
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
       a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
 }
 
diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh
index 593ae28e49..79f5ab12e9 100644
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -2,9 +2,9 @@
 
 set -xe
 
-mkdir -p /paddle/build_android/$ANDROID_ABI
-cd /paddle/build_android/$ANDROID_ABI
-rm -rf /paddle/install 2>/dev/null || true
+rm -rf /paddle/build_android 2>/dev/null || true
+mkdir -p /paddle/build_android
+cd /paddle/build_android
 
 THIRD_PARTY_PATH=/paddle/third_party_android/$ANDROID_ABI
 
@@ -14,19 +14,25 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then
       -DANDROID_ABI=$ANDROID_ABI \
       -DANDROID_ARM_NEON=ON \
       -DANDROID_ARM_MODE=ON \
+      -DCMAKE_C_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang \
+      -DCMAKE_CXX_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang++ \
       -DHOST_C_COMPILER=/usr/bin/gcc \
       -DHOST_CXX_COMPILER=/usr/bin/g++ \
       -DCMAKE_INSTALL_PREFIX=/paddle/install \
       -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \
       -DCMAKE_BUILD_TYPE=Release \
+      -DUSE_EIGEN_FOR_BLAS=ON \
       -DWITH_C_API=ON \
       -DWITH_SWIG_PY=OFF \
-      /paddle
-elif [ $ANDROID_ABI == "arm64-v7a" ]; then
+      -DWITH_STYLE_CHECK=OFF \
+      ..
+elif [ $ANDROID_ABI == "arm64-v8a" ]; then
   cmake -DCMAKE_SYSTEM_NAME=Android \
         -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \
         -DANDROID_ABI=$ANDROID_ABI \
         -DANDROID_ARM_MODE=ON \
+        -DCMAKE_C_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang \
+        -DCMAKE_CXX_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang++ \
         -DHOST_C_COMPILER=/usr/bin/gcc \
         -DHOST_CXX_COMPILER=/usr/bin/g++ \
         -DCMAKE_INSTALL_PREFIX=/paddle/install \
@@ -34,7 +40,7 @@ elif [ $ANDROID_ABI == "arm64-v7a" ]; then
       -DCMAKE_BUILD_TYPE=Release \
       -DWITH_C_API=ON \
       -DWITH_SWIG_PY=OFF \
-      /paddle
+      ..
 elif [ $ANDROID_ABI == "armeabi" ]; then
   cmake -DCMAKE_SYSTEM_NAME=Android \
         -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \
@@ -47,10 +53,10 @@ elif [ $ANDROID_ABI == "armeabi" ]; then
       -DCMAKE_BUILD_TYPE=Release \
       -DWITH_C_API=ON \
       -DWITH_SWIG_PY=OFF \
-      /paddle
+      ..
 else
   echo "Invalid ANDROID_ABI: $ANDROID_ABI"
 fi
 
-make -j `nproc`
-make install -j `nproc`
+make VERBOSE=1
+make install
-- 
GitLab
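
Note on the new GEMM entry point: the call sites in this patch are rewritten to
BlasGemm<DEVICE_TYPE_CPU, real>::compute(aTrans, bTrans, M, N, K, alpha, A, lda,
B, ldb, beta, C, ldc), so that USE_EIGEN_FOR_BLAS can back the same interface
with Eigen instead of CBLAS. The snippet below is a minimal, self-contained
sketch of that row-major GEMM contract, written only for illustration:
naiveGemm is a hypothetical stand-in, not Paddle's implementation, and assumes
the argument order visible at the call sites above.

// naiveGemm: hypothetical reference for the BlasGemm-style call.
// Computes C = alpha * op(A) * op(B) + beta * C in row-major storage,
// where op(X) is X or X^T depending on the transpose flag.
#include <cstdio>

void naiveGemm(bool aTrans, bool bTrans, int M, int N, int K, float alpha,
               const float* A, int lda, const float* B, int ldb, float beta,
               float* C, int ldc) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float sum = 0.f;
      for (int k = 0; k < K; ++k) {
        // op(A)[m][k] and op(B)[k][n], honoring the transpose flags.
        float a = aTrans ? A[k * lda + m] : A[m * lda + k];
        float b = bTrans ? B[n * ldb + k] : B[k * ldb + n];
        sum += a * b;
      }
      C[m * ldc + n] = alpha * sum + beta * C[m * ldc + n];
    }
  }
}

int main() {
  // 2x2 check: [[1,2],[3,4]] * [[5,6],[7,8]] = [[19,22],[43,50]].
  float A[] = {1, 2, 3, 4};
  float B[] = {5, 6, 7, 8};
  float C[] = {0, 0, 0, 0};
  naiveGemm(false, false, 2, 2, 2, 1.f, A, 2, B, 2, 0.f, C, 2);
  printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);
  return 0;
}

Compiled with any C++ compiler (e.g. clang++), this prints 19 22 / 43 50,
matching the hand-computed product.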