From 8a4fad4248e942061586538e8de14a7d08052330 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Wed, 23 Aug 2017 19:43:57 +0800
Subject: [PATCH] Support using clang for Android cross-compiling.

---
 cmake/cblas.cmake                      |   4 +
 cmake/external/warpctc.cmake           |   1 +
 paddle/cuda/include/hl_cpu_gru.cuh     | 166 ++++++++++++-------------
 paddle/function/MulOp.cpp              |  37 +++---
 paddle/math/MathFunctions.cpp          |   4 +
 paddle/math/MathFunctions.h            |  23 +++-
 paddle/math/Matrix.cpp                 |  18 ++-
 paddle/scripts/docker/build_android.sh |  24 ++--
 8 files changed, 155 insertions(+), 122 deletions(-)
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 854066fd1d2..ab111eccc09 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -13,6 +13,10 @@
 # system paths.
 #
 
+if(USE_EIGEN_FOR_BLAS)
+  return()
+endif(USE_EIGEN_FOR_BLAS)
+
 set(CBLAS_FOUND OFF)
 
 ## Find MKLML First.
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 2d7daed9bcd..3cc652bed5e 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -41,6 +41,7 @@ IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "App
 ELSE()
     SET(USE_OMP ON)
 ENDIF()
+SET(USE_OMP OFF)  # Disable OpenMP for warp-ctc, overriding the compiler check above.
 
 ExternalProject_Add(
     extern_warpctc
diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh
index c0a37ced2a7..732799a28b2 100644
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@@ -20,11 +20,11 @@ limitations under the License. */
 
 #include "paddle/math/MathFunctions.h"
 
-#ifndef PADDLE_TYPE_DOUBLE
-#define     CBLAS_GEMM     paddle::gemm<float>
-#else
-#define     CBLAS_GEMM     paddle::gemm<double>
-#endif
+// #ifndef PADDLE_TYPE_DOUBLE
+// #define     CBLAS_GEMM     paddle::gemm<float>
+// #else
+// #define     CBLAS_GEMM     paddle::gemm<double>
+// #endif
 
 template<class OpResetOutput>
 void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
@@ -219,37 +219,37 @@ void hl_cpu_gru_forward(OpResetOutput opResetOutput,
                         hl_activation_mode_t active_node,
                         hl_activation_mode_t active_gate) {
   if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               2 * frameSize,
-               frameSize,
-               1,
-               value.prevOutValue,
-               frameSize,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               value.gateValue,
-               frameSize * 3);
+//     CBLAS_GEMM(CblasNoTrans,
+//                CblasNoTrans,
+//                batchSize,
+//                2 * frameSize,
+//                frameSize,
+//                1,
+//                value.prevOutValue,
+//                frameSize,
+//                value.gateWeight,
+//                frameSize * 2,
+//                1,
+//                value.gateValue,
+//                frameSize * 3);
   }
 
   forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
 
   if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               value.resetOutputValue,
-               frameSize,
-               value.stateWeight,
-               frameSize,
-               1,
-               value.gateValue + frameSize * 2,
-               frameSize * 3);
+//    CBLAS_GEMM(CblasNoTrans,
+//               CblasNoTrans,
+//               batchSize,
+//               frameSize,
+//               frameSize,
+//               1,
+//               value.resetOutputValue,
+//               frameSize,
+//               value.stateWeight,
+//               frameSize,
+//               1,
+//               value.gateValue + frameSize * 2,
+//               frameSize * 3);
   }
 
   forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
@@ -538,34 +538,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad,
     frameSize, batchSize, active_node);
 
   if (value.prevOutValue && grad.prevOutGrad) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               grad.gateGrad + frameSize * 2,
-               frameSize * 3,
-               value.stateWeight,
-               frameSize,
-               0,
-               grad.resetOutputGrad,
-               frameSize);
+//     CBLAS_GEMM(CblasNoTrans,
+//                CblasTrans,
+//                batchSize,
+//                frameSize,
+//                frameSize,
+//                1,
+//                grad.gateGrad + frameSize * 2,
+//                frameSize * 3,
+//                value.stateWeight,
+//                frameSize,
+//                0,
+//                grad.resetOutputGrad,
+//                frameSize);
 
     if (grad.stateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize,
-                 batchSize,
-                 1,
-                 value.resetOutputValue,
-                 frameSize,
-                 grad.gateGrad + frameSize * 2,
-                 frameSize * 3,
-                 1,
-                 grad.stateWeightGrad,
-                 frameSize);
+//       CBLAS_GEMM(CblasTrans,
+//                  CblasNoTrans,
+//                  frameSize,
+//                  frameSize,
+//                  batchSize,
+//                  1,
+//                  value.resetOutputValue,
+//                  frameSize,
+//                  grad.gateGrad + frameSize * 2,
+//                  frameSize * 3,
+//                  1,
+//                  grad.stateWeightGrad,
+//                  frameSize);
     }
   }
 
@@ -573,34 +573,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad,
     frameSize, batchSize, active_gate);
 
   if (grad.prevOutGrad && value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize * 2,
-               1,
-               grad.gateGrad,
-               frameSize * 3,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               grad.prevOutGrad,
-               frameSize);
+//     CBLAS_GEMM(CblasNoTrans,
+//                CblasTrans,
+//                batchSize,
+//                frameSize,
+//                frameSize * 2,
+//                1,
+//                grad.gateGrad,
+//                frameSize * 3,
+//                value.gateWeight,
+//                frameSize * 2,
+//                1,
+//                grad.prevOutGrad,
+//                frameSize);
 
     if (grad.gateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize * 2,
-                 batchSize,
-                 1,
-                 value.prevOutValue,
-                 frameSize,
-                 grad.gateGrad,
-                 frameSize * 3,
-                 1,
-                 grad.gateWeightGrad,
-                 frameSize * 2);
+//       CBLAS_GEMM(CblasTrans,
+//                  CblasNoTrans,
+//                  frameSize,
+//                  frameSize * 2,
+//                  batchSize,
+//                  1,
+//                  value.prevOutValue,
+//                  frameSize,
+//                  grad.gateGrad,
+//                  frameSize * 3,
+//                  1,
+//                  grad.gateWeightGrad,
+//                  frameSize * 2);
     }
   }
 }
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 91b4b8ed91b..25e41edad54 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -13,18 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "MulOp.h"
-/// todo(tianbing), delete it
-#include <iostream>
-#include "paddle/math/MathFunctions.h"
+#include "GemmFunctor.h"
 #include "paddle/math/SIMDFunctions.h"
 #include "paddle/utils/ThreadLocal.h"
 
-#ifndef PADDLE_TYPE_DOUBLE
-#define GEMM paddle::gemm<float>
-#else
-#define GEMM paddle::gemm<double>
-#endif
-
 namespace {
 inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
   for (unsigned int i = 0; i < len; ++i) {
@@ -114,19 +106,20 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
                             real scaleT,
                             bool aTrans,
                             bool bTrans) {
-  GEMM(aTrans ? CblasTrans : CblasNoTrans,
-       bTrans ? CblasTrans : CblasNoTrans,
-       out.getHeight(),
-       out.getWidth(),
-       !aTrans ? a.getWidth() : a.getHeight(),
-       scaleAB,
-       a.getData(),
-       a.getStride(),
-       b.getData(),
-       b.getStride(),
-       scaleT,
-       out.getData(),
-       out.getStride());
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
+      aTrans,
+      bTrans,
+      out.getHeight(),
+      out.getWidth(),
+      !aTrans ? a.getWidth() : a.getHeight(),
+      scaleAB,
+      a.getData(),
+      a.getStride(),
+      b.getData(),
+      b.getStride(),
+      scaleT,
+      out.getData(),
+      out.getStride());
 }
 
 /// dense matrix (+)= sparse matrix * dense matrix
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index c8ba1074a15..c2f17beeb87 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
 
 namespace paddle {
 
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void gemm<float>(const CBLAS_TRANSPOSE transA,
                  const CBLAS_TRANSPOSE transB,
@@ -143,6 +144,7 @@ void gemm<double>(const CBLAS_TRANSPOSE transA,
               C,
               ldc);
 }
+#endif
 
 template <>
 int getrf<float>(const CBLAS_ORDER order,
@@ -182,6 +184,7 @@ int getri<double>(const CBLAS_ORDER order,
   return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
 }
 
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void axpy<float>(const int n, const float alpha, const float* x, float* y) {
   cblas_saxpy(n, alpha, x, 1, y, 1);
@@ -201,6 +204,7 @@ template <>
 double dotProduct<double>(const int n, const double* x, const double* y) {
   return cblas_ddot(n, x, 1, y, 1);
 }
+#endif
 
 #if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
 
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index 637643838ff..9297ae78c27 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -40,7 +40,14 @@ extern "C" {
 
 #ifndef LAPACK_FOUND
 extern "C" {
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 #include <cblas.h>
+#else
+typedef enum CBLAS_ORDER {
+  CblasRowMajor = 101,
+  CblasColMajor = 102
+} CBLAS_ORDER;
+#endif
 int LAPACKE_sgetrf(
     int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
 int LAPACKE_dgetrf(
@@ -56,6 +63,7 @@ int LAPACKE_dgetri(
 
 namespace paddle {
 
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <class T>
 void gemm(const CBLAS_TRANSPOSE transA,
           const CBLAS_TRANSPOSE transB,
@@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA,
           const T beta,
           T* C,
           const int ldc);
+#endif
 
 template <class T>
 int getrf(const CBLAS_ORDER Order,
@@ -84,10 +93,20 @@ int getri(
     const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);
 
 template <class T>
-void axpy(const int n, const T alpha, const T* x, T* y);
+void axpy(const int n, const T alpha, const T* x, T* y) {
+  /// Accumulate the scaled vector in place: y[k] += alpha * x[k], k in [0, n).
+  for (int k = 0; k < n; ++k) {
+    y[k] += alpha * x[k];
+  }
+}
 
 template <class T>
-T dotProduct(const int n, const T* x, const T* y);
+T dotProduct(const int n, const T* x, const T* y) {
+  /// Returns sum(x[i] * y[i]) for i in [0, n); the sum is 0 when n <= 0.
+  T result = static_cast<T>(0);
+  for (int i = 0; i < n; i++) result += x[i] * y[i];
+  return result;  // a value-returning function must not fall off its end
+}
 
 template <class T>
 void vExp(const int n, const T* a, T* r);
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 27f7d95b752..fbf3accc9a5 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "hl_top_k.h"
 #include "paddle/utils/Logging.h"
 
+#include "paddle/function/GemmFunctor.h"
 #include "paddle/utils/ThreadLocal.h"
 
 #include "SIMDFunctions.h"
@@ -2222,24 +2223,29 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
   CHECK(!isTransposed()) << "Not supported";
 
   size_t a_col, b_col, a_row, b_row;
-  CBLAS_TRANSPOSE a_trans, b_trans;
+  // CBLAS_TRANSPOSE a_trans, b_trans;
+  bool a_trans, b_trans;
   if (!a->isTransposed()) {
     a_col = a->getWidth();
     a_row = a->getHeight();
-    a_trans = CblasNoTrans;
+    // a_trans = CblasNoTrans;
+    a_trans = false;
   } else {
     a_col = a->getHeight();
     a_row = a->getWidth();
-    a_trans = CblasTrans;
+    // a_trans = CblasTrans;
+    a_trans = true;
   }
   if (!b->isTransposed()) {
     b_col = b->getWidth();
     b_row = b->getHeight();
-    b_trans = CblasNoTrans;
+    // b_trans = CblasNoTrans;
+    b_trans = false;
   } else {
     b_col = b->getHeight();
     b_row = b->getWidth();
-    b_trans = CblasTrans;
+    // b_trans = CblasTrans;
+    b_trans = true;
   }
 
   CHECK_EQ(a_col, b_row);
@@ -2256,7 +2262,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
   int lda = a->getStride();
   int ldb = b->getStride();
   int ldc = getStride();
-  gemm<real>(
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
       a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
 }
 
diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh
index 593ae28e495..79f5ab12e98 100644
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -2,9 +2,9 @@
 
 set -xe
 
-mkdir -p /paddle/build_android/$ANDROID_ABI
-cd /paddle/build_android/$ANDROID_ABI
-rm -rf /paddle/install 2>/dev/null || true
+rm -rf /paddle/build_android 2>/dev/null || true
+mkdir -p /paddle/build_android
+cd /paddle/build_android
 
 THIRD_PARTY_PATH=/paddle/third_party_android/$ANDROID_ABI
 
@@ -14,19 +14,25 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then
         -DANDROID_ABI=$ANDROID_ABI \
         -DANDROID_ARM_NEON=ON \
         -DANDROID_ARM_MODE=ON \
+        -DCMAKE_C_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang \
+        -DCMAKE_CXX_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang++ \
         -DHOST_C_COMPILER=/usr/bin/gcc \
         -DHOST_CXX_COMPILER=/usr/bin/g++ \
         -DCMAKE_INSTALL_PREFIX=/paddle/install \
         -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \
         -DCMAKE_BUILD_TYPE=Release \
+        -DUSE_EIGEN_FOR_BLAS=ON \
         -DWITH_C_API=ON \
         -DWITH_SWIG_PY=OFF \
-        /paddle
-elif [ $ANDROID_ABI == "arm64-v7a" ]; then
+        -DWITH_STYLE_CHECK=OFF \
+        ..
+elif [ $ANDROID_ABI == "arm64-v8a" ]; then
   cmake -DCMAKE_SYSTEM_NAME=Android \
         -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \
         -DANDROID_ABI=$ANDROID_ABI \
         -DANDROID_ARM_MODE=ON \
+        -DCMAKE_C_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang \
+        -DCMAKE_CXX_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang++ \
         -DHOST_C_COMPILER=/usr/bin/gcc \
         -DHOST_CXX_COMPILER=/usr/bin/g++ \
         -DCMAKE_INSTALL_PREFIX=/paddle/install \
@@ -34,7 +40,7 @@ elif [ $ANDROID_ABI == "arm64-v7a" ]; then
         -DCMAKE_BUILD_TYPE=Release \
         -DWITH_C_API=ON \
         -DWITH_SWIG_PY=OFF \
-        /paddle
+        ..
 elif [ $ANDROID_ABI == "armeabi" ]; then
   cmake -DCMAKE_SYSTEM_NAME=Android \
         -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \
@@ -47,10 +53,10 @@ elif [ $ANDROID_ABI == "armeabi" ]; then
         -DCMAKE_BUILD_TYPE=Release \
         -DWITH_C_API=ON \
         -DWITH_SWIG_PY=OFF \
-        /paddle
+        ..
 else
   echo "Invalid ANDROID_ABI: $ANDROID_ABI"
 fi
 
-make -j `nproc`
-make install -j `nproc`
+make VERBOSE=1
+make install
-- 
GitLab