Add support for using clang when cross-compiling for Android.

8a4fad42 · Liu Yiqun · 5ca41184 · 8a4fad42 · 8a4fad42 · 8a4fad42
8 changed files
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -13,6 +13,10 @@
 # system paths.
 #

+if(USE_EIGEN_FOR_BLAS)
+  return()
+endif(USE_EIGEN_FOR_BLAS)
+
 set(CBLAS_FOUND OFF)

 ## Find MKLML First.

--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -41,6 +41,8 @@ IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "App
 ELSE()
    SET(USE_OMP ON)
 ENDIF()
+# NOTE(review): overrides the choice above for all builds, not only Android.
+SET(USE_OMP OFF)

 ExternalProject_Add(
    extern_warpctc

--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@@ -20,11 +20,10 @@ limitations under the License. */

+#include "paddle/function/GemmFunctor.h"
 #include "paddle/math/MathFunctions.h"

-#ifndef PADDLE_TYPE_DOUBLE
-#define     CBLAS_GEMM     paddle::gemm<float>
-#else
-#define     CBLAS_GEMM     paddle::gemm<double>
-#endif
+// GEMM calls below go through paddle::BlasGemm (as in MulOp.cpp and
+// Matrix.cpp) because paddle::gemm is not declared when
+// PADDLE_USE_EIGEN_FOR_BLAS is defined.

 template<class OpResetOutput>
 void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
@@ -219,37 +218,39 @@ void hl_cpu_gru_forward(OpResetOutput opResetOutput,
                        hl_activation_mode_t active_node,
                        hl_activation_mode_t active_gate) {
  if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               2 * frameSize,
-               frameSize,
-               1,
-               value.prevOutValue,
-               frameSize,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               value.gateValue,
-               frameSize * 3);
+    paddle::BlasGemm<paddle::DEVICE_TYPE_CPU, real>::compute(
+        false,
+        false,
+        batchSize,
+        2 * frameSize,
+        frameSize,
+        1,
+        value.prevOutValue,
+        frameSize,
+        value.gateWeight,
+        frameSize * 2,
+        1,
+        value.gateValue,
+        frameSize * 3);
  }

  forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);

  if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               value.resetOutputValue,
-               frameSize,
-               value.stateWeight,
-               frameSize,
-               1,
-               value.gateValue + frameSize * 2,
-               frameSize * 3);
+    paddle::BlasGemm<paddle::DEVICE_TYPE_CPU, real>::compute(
+        false,
+        false,
+        batchSize,
+        frameSize,
+        frameSize,
+        1,
+        value.resetOutputValue,
+        frameSize,
+        value.stateWeight,
+        frameSize,
+        1,
+        value.gateValue + frameSize * 2,
+        frameSize * 3);
  }

  forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
@@ -538,34 +539,36 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad,
    frameSize, batchSize, active_node);

  if (value.prevOutValue && grad.prevOutGrad) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               grad.gateGrad + frameSize * 2,
-               frameSize * 3,
-               value.stateWeight,
-               frameSize,
-               0,
-               grad.resetOutputGrad,
-               frameSize);
+    paddle::BlasGemm<paddle::DEVICE_TYPE_CPU, real>::compute(
+        false,
+        true,
+        batchSize,
+        frameSize,
+        frameSize,
+        1,
+        grad.gateGrad + frameSize * 2,
+        frameSize * 3,
+        value.stateWeight,
+        frameSize,
+        0,
+        grad.resetOutputGrad,
+        frameSize);

    if (grad.stateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize,
-                 batchSize,
-                 1,
-                 value.resetOutputValue,
-                 frameSize,
-                 grad.gateGrad + frameSize * 2,
-                 frameSize * 3,
-                 1,
-                 grad.stateWeightGrad,
-                 frameSize);
+      paddle::BlasGemm<paddle::DEVICE_TYPE_CPU, real>::compute(
+          true,
+          false,
+          frameSize,
+          frameSize,
+          batchSize,
+          1,
+          value.resetOutputValue,
+          frameSize,
+          grad.gateGrad + frameSize * 2,
+          frameSize * 3,
+          1,
+          grad.stateWeightGrad,
+          frameSize);
    }
  }

@@ -573,34 +576,36 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad,
    frameSize, batchSize, active_gate);

  if (grad.prevOutGrad && value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize * 2,
-               1,
-               grad.gateGrad,
-               frameSize * 3,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               grad.prevOutGrad,
-               frameSize);
+    paddle::BlasGemm<paddle::DEVICE_TYPE_CPU, real>::compute(
+        false,
+        true,
+        batchSize,
+        frameSize,
+        frameSize * 2,
+        1,
+        grad.gateGrad,
+        frameSize * 3,
+        value.gateWeight,
+        frameSize * 2,
+        1,
+        grad.prevOutGrad,
+        frameSize);

    if (grad.gateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize * 2,
-                 batchSize,
-                 1,
-                 value.prevOutValue,
-                 frameSize,
-                 grad.gateGrad,
-                 frameSize * 3,
-                 1,
-                 grad.gateWeightGrad,
-                 frameSize * 2);
+      paddle::BlasGemm<paddle::DEVICE_TYPE_CPU, real>::compute(
+          true,
+          false,
+          frameSize,
+          frameSize * 2,
+          batchSize,
+          1,
+          value.prevOutValue,
+          frameSize,
+          grad.gateGrad,
+          frameSize * 3,
+          1,
+          grad.gateWeightGrad,
+          frameSize * 2);
    }
  }
 }

--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -13,18 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "MulOp.h"
-/// todo(tianbing), delete it
-#include <iostream>
-#include "paddle/math/MathFunctions.h"
+#include "GemmFunctor.h"
 #include "paddle/math/SIMDFunctions.h"
 #include "paddle/utils/ThreadLocal.h"

-#ifndef PADDLE_TYPE_DOUBLE
-#define GEMM paddle::gemm<float>
-#else
-#define GEMM paddle::gemm<double>
-#endif
-
 namespace {
 inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
  for (unsigned int i = 0; i < len; ++i) {
@@ -114,19 +106,20 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
                            real scaleT,
                            bool aTrans,
                            bool bTrans) {
-  GEMM(aTrans ? CblasTrans : CblasNoTrans,
-       bTrans ? CblasTrans : CblasNoTrans,
-       out.getHeight(),
-       out.getWidth(),
-       !aTrans ? a.getWidth() : a.getHeight(),
-       scaleAB,
-       a.getData(),
-       a.getStride(),
-       b.getData(),
-       b.getStride(),
-       scaleT,
-       out.getData(),
-       out.getStride());
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
+      aTrans,
+      bTrans,
+      out.getHeight(),
+      out.getWidth(),
+      !aTrans ? a.getWidth() : a.getHeight(),
+      scaleAB,
+      a.getData(),
+      a.getStride(),
+      b.getData(),
+      b.getStride(),
+      scaleT,
+      out.getData(),
+      out.getStride());
 }

 /// dense matrix (+)= sparse matrix * dense matrix

--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)

 namespace paddle {

+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void gemm<float>(const CBLAS_TRANSPOSE transA,
                 const CBLAS_TRANSPOSE transB,
@@ -143,6 +144,7 @@ void gemm<double>(const CBLAS_TRANSPOSE transA,
              C,
              ldc);
 }
+#endif

 template <>
 int getrf<float>(const CBLAS_ORDER order,
@@ -182,6 +184,7 @@ int getri<double>(const CBLAS_ORDER order,
  return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
 }

+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void axpy<float>(const int n, const float alpha, const float* x, float* y) {
  cblas_saxpy(n, alpha, x, 1, y, 1);
@@ -201,6 +204,7 @@ template <>
 double dotProduct<double>(const int n, const double* x, const double* y) {
  return cblas_ddot(n, x, 1, y, 1);
 }
+#endif

 #if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)


--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -40,7 +40,14 @@ extern "C" {

 #ifndef LAPACK_FOUND
 extern "C" {
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 #include <cblas.h>
+#else
+typedef enum CBLAS_ORDER {
+  CblasRowMajor = 101,
+  CblasColMajor = 102
+} CBLAS_ORDER;
+#endif
 int LAPACKE_sgetrf(
    int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
 int LAPACKE_dgetrf(
@@ -56,6 +63,7 @@ int LAPACKE_dgetri(

 namespace paddle {

+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <class T>
 void gemm(const CBLAS_TRANSPOSE transA,
          const CBLAS_TRANSPOSE transB,
@@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA,
          const T beta,
          T* C,
          const int ldc);
+#endif

 template <class T>
 int getrf(const CBLAS_ORDER Order,
@@ -84,10 +93,22 @@ int getri(
    const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);

 template <class T>
-void axpy(const int n, const T alpha, const T* x, T* y);
+void axpy(const int n, const T alpha, const T* x, T* y) {
+  /// y = y + alpha * x
+  for (int i = 0; i < n; i++) {
+    y[i] = y[i] + alpha * x[i];
+  }
+}

 template <class T>
-T dotProduct(const int n, const T* x, const T* y);
+T dotProduct(const int n, const T* x, const T* y) {
+  /// result = sum(x[i] * y[i]) for i in [0, n)
+  T result = static_cast<T>(0);
+  for (int i = 0; i < n; i++) {
+    result += x[i] * y[i];
+  }
+  return result;
+}

 template <class T>
 void vExp(const int n, const T* a, T* r);

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "hl_top_k.h"
 #include "paddle/utils/Logging.h"

+#include "paddle/function/GemmFunctor.h"
 #include "paddle/utils/ThreadLocal.h"

 #include "SIMDFunctions.h"
@@ -2222,24 +2223,24 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
  CHECK(!isTransposed()) << "Not supported";

  size_t a_col, b_col, a_row, b_row;
-  CBLAS_TRANSPOSE a_trans, b_trans;
+  bool a_trans, b_trans;
  if (!a->isTransposed()) {
    a_col = a->getWidth();
    a_row = a->getHeight();
-    a_trans = CblasNoTrans;
+    a_trans = false;
  } else {
    a_col = a->getHeight();
    a_row = a->getWidth();
-    a_trans = CblasTrans;
+    a_trans = true;
  }
  if (!b->isTransposed()) {
    b_col = b->getWidth();
    b_row = b->getHeight();
-    b_trans = CblasNoTrans;
+    b_trans = false;
  } else {
    b_col = b->getHeight();
    b_row = b->getWidth();
-    b_trans = CblasTrans;
+    b_trans = true;
  }

  CHECK_EQ(a_col, b_row);
@@ -2256,7 +2257,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
  int lda = a->getStride();
  int ldb = b->getStride();
  int ldc = getStride();
-  gemm<real>(
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
      a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
 }


--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -2,9 +2,9 @@

 set -xe

-mkdir -p /paddle/build_android/$ANDROID_ABI
-cd /paddle/build_android/$ANDROID_ABI
-rm -rf /paddle/install 2>/dev/null || true
+rm -rf /paddle/build_android 2>/dev/null || true
+mkdir -p /paddle/build_android
+cd /paddle/build_android

 THIRD_PARTY_PATH=/paddle/third_party_android/$ANDROID_ABI

@@ -14,19 +14,25 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then
        -DANDROID_ABI=$ANDROID_ABI \
        -DANDROID_ARM_NEON=ON \
        -DANDROID_ARM_MODE=ON \
+        -DCMAKE_C_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang \
+        -DCMAKE_CXX_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang++ \
        -DHOST_C_COMPILER=/usr/bin/gcc \
        -DHOST_CXX_COMPILER=/usr/bin/g++ \
        -DCMAKE_INSTALL_PREFIX=/paddle/install \
        -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \
        -DCMAKE_BUILD_TYPE=Release \
+        -DUSE_EIGEN_FOR_BLAS=ON \
        -DWITH_C_API=ON \
        -DWITH_SWIG_PY=OFF \
-        /paddle
-elif [ $ANDROID_ABI == "arm64-v7a" ]; then
+        -DWITH_STYLE_CHECK=OFF \
+        ..
+elif [ $ANDROID_ABI == "arm64-v8a" ]; then
  cmake -DCMAKE_SYSTEM_NAME=Android \
        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \
        -DANDROID_ABI=$ANDROID_ABI \
        -DANDROID_ARM_MODE=ON \
+        -DCMAKE_C_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang \
+        -DCMAKE_CXX_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang++ \
        -DHOST_C_COMPILER=/usr/bin/gcc \
        -DHOST_CXX_COMPILER=/usr/bin/g++ \
        -DCMAKE_INSTALL_PREFIX=/paddle/install \
@@ -34,7 +40,7 @@ elif [ $ANDROID_ABI == "arm64-v7a" ]; then
        -DCMAKE_BUILD_TYPE=Release \
        -DWITH_C_API=ON \
        -DWITH_SWIG_PY=OFF \
-        /paddle
+        ..
 elif [ $ANDROID_ABI == "armeabi" ]; then
  cmake -DCMAKE_SYSTEM_NAME=Android \
        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \
@@ -47,10 +53,10 @@ elif [ $ANDROID_ABI == "armeabi" ]; then
        -DCMAKE_BUILD_TYPE=Release \
        -DWITH_C_API=ON \
        -DWITH_SWIG_PY=OFF \
-        /paddle
+        ..
 else
  echo "Invalid ANDROID_ABI: $ANDROID_ABI"
 fi

-make -j `nproc`
-make install -j `nproc`
+make VERBOSE=1
+make install