Commit 8a4fad42 authored by Liu Yiqun

Support using clang for Android cross-compiling.

Parent: 5ca41184
@@ -13,6 +13,10 @@
 # system paths.
 #
+if(USE_EIGEN_FOR_BLAS)
+  return()
+endif(USE_EIGEN_FOR_BLAS)
+
 set(CBLAS_FOUND OFF)

 ## Find MKLML First.
......
@@ -41,6 +41,7 @@ IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "App
 ELSE()
     SET(USE_OMP ON)
 ENDIF()
+SET(USE_OMP OFF FORCE)

 ExternalProject_Add(
     extern_warpctc
......
@@ -20,11 +20,11 @@ limitations under the License. */

 #include "paddle/math/MathFunctions.h"

-#ifndef PADDLE_TYPE_DOUBLE
-#define CBLAS_GEMM paddle::gemm<float>
-#else
-#define CBLAS_GEMM paddle::gemm<double>
-#endif
+// #ifndef PADDLE_TYPE_DOUBLE
+// #define CBLAS_GEMM paddle::gemm<float>
+// #else
+// #define CBLAS_GEMM paddle::gemm<double>
+// #endif

 template<class OpResetOutput>
 void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
@@ -219,37 +219,37 @@ void hl_cpu_gru_forward(OpResetOutput opResetOutput,
                         hl_activation_mode_t active_node,
                         hl_activation_mode_t active_gate) {
   if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               2 * frameSize,
-               frameSize,
-               1,
-               value.prevOutValue,
-               frameSize,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               value.gateValue,
-               frameSize * 3);
+    // CBLAS_GEMM(CblasNoTrans,
+    //            CblasNoTrans,
+    //            batchSize,
+    //            2 * frameSize,
+    //            frameSize,
+    //            1,
+    //            value.prevOutValue,
+    //            frameSize,
+    //            value.gateWeight,
+    //            frameSize * 2,
+    //            1,
+    //            value.gateValue,
+    //            frameSize * 3);
   }

   forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);

   if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               value.resetOutputValue,
-               frameSize,
-               value.stateWeight,
-               frameSize,
-               1,
-               value.gateValue + frameSize * 2,
-               frameSize * 3);
+    // CBLAS_GEMM(CblasNoTrans,
+    //            CblasNoTrans,
+    //            batchSize,
+    //            frameSize,
+    //            frameSize,
+    //            1,
+    //            value.resetOutputValue,
+    //            frameSize,
+    //            value.stateWeight,
+    //            frameSize,
+    //            1,
+    //            value.gateValue + frameSize * 2,
+    //            frameSize * 3);
   }

   forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
@@ -538,34 +538,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad,
                          frameSize, batchSize, active_node);

   if (value.prevOutValue && grad.prevOutGrad) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               grad.gateGrad + frameSize * 2,
-               frameSize * 3,
-               value.stateWeight,
-               frameSize,
-               0,
-               grad.resetOutputGrad,
-               frameSize);
+    // CBLAS_GEMM(CblasNoTrans,
+    //            CblasTrans,
+    //            batchSize,
+    //            frameSize,
+    //            frameSize,
+    //            1,
+    //            grad.gateGrad + frameSize * 2,
+    //            frameSize * 3,
+    //            value.stateWeight,
+    //            frameSize,
+    //            0,
+    //            grad.resetOutputGrad,
+    //            frameSize);

     if (grad.stateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize,
-                 batchSize,
-                 1,
-                 value.resetOutputValue,
-                 frameSize,
-                 grad.gateGrad + frameSize * 2,
-                 frameSize * 3,
-                 1,
-                 grad.stateWeightGrad,
-                 frameSize);
+      // CBLAS_GEMM(CblasTrans,
+      //            CblasNoTrans,
+      //            frameSize,
+      //            frameSize,
+      //            batchSize,
+      //            1,
+      //            value.resetOutputValue,
+      //            frameSize,
+      //            grad.gateGrad + frameSize * 2,
+      //            frameSize * 3,
+      //            1,
+      //            grad.stateWeightGrad,
+      //            frameSize);
     }
   }
@@ -573,34 +573,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad,
                          frameSize, batchSize, active_gate);

   if (grad.prevOutGrad && value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize * 2,
-               1,
-               grad.gateGrad,
-               frameSize * 3,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               grad.prevOutGrad,
-               frameSize);
+    // CBLAS_GEMM(CblasNoTrans,
+    //            CblasTrans,
+    //            batchSize,
+    //            frameSize,
+    //            frameSize * 2,
+    //            1,
+    //            grad.gateGrad,
+    //            frameSize * 3,
+    //            value.gateWeight,
+    //            frameSize * 2,
+    //            1,
+    //            grad.prevOutGrad,
+    //            frameSize);

     if (grad.gateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize * 2,
-                 batchSize,
-                 1,
-                 value.prevOutValue,
-                 frameSize,
-                 grad.gateGrad,
-                 frameSize * 3,
-                 1,
-                 grad.gateWeightGrad,
-                 frameSize * 2);
+      // CBLAS_GEMM(CblasTrans,
+      //            CblasNoTrans,
+      //            frameSize,
+      //            frameSize * 2,
+      //            batchSize,
+      //            1,
+      //            value.prevOutValue,
+      //            frameSize,
+      //            grad.gateGrad,
+      //            frameSize * 3,
+      //            1,
+      //            grad.gateWeightGrad,
+      //            frameSize * 2);
     }
   }
 }
......
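
Note: the commented-out CBLAS_GEMM calls above disable the dense-matrix terms of the CPU GRU until they are reimplemented on top of BlasGemm. For reference, the first disabled call computes gateValue = 1 * prevOutValue * gateWeight + 1 * gateValue in row-major layout; a plain-loop sketch of that operation (illustrative only, not code added by this commit):

    // Naive equivalent of:
    //   CBLAS_GEMM(CblasNoTrans, CblasNoTrans, batchSize, 2 * frameSize,
    //              frameSize, 1, prevOutValue, frameSize, gateWeight,
    //              frameSize * 2, 1, gateValue, frameSize * 3);
    // i.e. row-major C = A * B + C with the GRU leading dimensions.
    void gruGateGemmSketch(int batchSize, int frameSize,
                           const float* prevOutValue,  // batchSize x frameSize
                           const float* gateWeight,    // frameSize x (2 * frameSize)
                           float* gateValue) {         // batchSize x (3 * frameSize)
      for (int m = 0; m < batchSize; ++m) {
        for (int n = 0; n < 2 * frameSize; ++n) {
          float sum = 0.0f;
          for (int k = 0; k < frameSize; ++k) {
            sum += prevOutValue[m * frameSize + k] * gateWeight[k * 2 * frameSize + n];
          }
          gateValue[m * 3 * frameSize + n] += sum;  // beta == 1: accumulate into the gates
        }
      }
    }
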
@@ -13,18 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "MulOp.h"
-/// todo(tianbing), delete it
-#include <iostream>
-#include "paddle/math/MathFunctions.h"
+#include "GemmFunctor.h"
 #include "paddle/math/SIMDFunctions.h"
 #include "paddle/utils/ThreadLocal.h"

-#ifndef PADDLE_TYPE_DOUBLE
-#define GEMM paddle::gemm<float>
-#else
-#define GEMM paddle::gemm<double>
-#endif
-
 namespace {

 inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
   for (unsigned int i = 0; i < len; ++i) {
@@ -114,19 +106,20 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
                             real scaleT,
                             bool aTrans,
                             bool bTrans) {
-  GEMM(aTrans ? CblasTrans : CblasNoTrans,
-       bTrans ? CblasTrans : CblasNoTrans,
-       out.getHeight(),
-       out.getWidth(),
-       !aTrans ? a.getWidth() : a.getHeight(),
-       scaleAB,
-       a.getData(),
-       a.getStride(),
-       b.getData(),
-       b.getStride(),
-       scaleT,
-       out.getData(),
-       out.getStride());
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
+      aTrans,
+      bTrans,
+      out.getHeight(),
+      out.getWidth(),
+      !aTrans ? a.getWidth() : a.getHeight(),
+      scaleAB,
+      a.getData(),
+      a.getStride(),
+      b.getData(),
+      b.getStride(),
+      scaleT,
+      out.getData(),
+      out.getStride());
 }

 /// dense matrix (+)= sparse matrix * dense matrix
......
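
GemmFunctor.h itself is not shown in this diff, so the exact BlasGemm interface is an assumption read off the call sites here and in Matrix.cpp: a functor templated on device type and element type, with a static compute() that takes bool transpose flags instead of CBLAS_TRANSPOSE. A minimal cblas-backed sketch of such an interface (hypothetical; the committed header may differ):

    #include <cblas.h>

    enum DeviceType { DEVICE_TYPE_CPU = 0, DEVICE_TYPE_GPU = 1 };  // assumed enum

    // Shape of the functor implied by the call sites in this diff.
    template <DeviceType Device, typename T>
    struct BlasGemm {
      static void compute(bool transA, bool transB, int M, int N, int K,
                          T alpha, const T* A, int lda, const T* B, int ldb,
                          T beta, T* C, int ldc);
    };

    // CPU float specialization forwarding to row-major cblas_sgemm.
    template <>
    inline void BlasGemm<DEVICE_TYPE_CPU, float>::compute(
        bool transA, bool transB, int M, int N, int K,
        float alpha, const float* A, int lda, const float* B, int ldb,
        float beta, float* C, int ldc) {
      cblas_sgemm(CblasRowMajor,
                  transA ? CblasTrans : CblasNoTrans,
                  transB ? CblasTrans : CblasNoTrans,
                  M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
    }
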
@@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)

 namespace paddle {

+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void gemm<float>(const CBLAS_TRANSPOSE transA,
                  const CBLAS_TRANSPOSE transB,
@@ -143,6 +144,7 @@ void gemm<double>(const CBLAS_TRANSPOSE transA,
                   C,
                   ldc);
 }
+#endif

 template <>
 int getrf<float>(const CBLAS_ORDER order,
@@ -182,6 +184,7 @@ int getri<double>(const CBLAS_ORDER order,
   return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
 }

+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void axpy<float>(const int n, const float alpha, const float* x, float* y) {
   cblas_saxpy(n, alpha, x, 1, y, 1);
@@ -201,6 +204,7 @@ template <>
 double dotProduct<double>(const int n, const double* x, const double* y) {
   return cblas_ddot(n, x, 1, y, 1);
 }
+#endif

 #if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
......
@@ -40,7 +40,14 @@ extern "C" {

 #ifndef LAPACK_FOUND
 extern "C" {
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 #include <cblas.h>
+#else
+typedef enum CBLAS_ORDER {
+  CblasRowMajor = 101,
+  CblasColMajor = 102
+} CBLAS_ORDER;
+#endif
 int LAPACKE_sgetrf(
     int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
 int LAPACKE_dgetrf(
@@ -56,6 +63,7 @@ int LAPACKE_dgetri(

 namespace paddle {

+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <class T>
 void gemm(const CBLAS_TRANSPOSE transA,
           const CBLAS_TRANSPOSE transB,
@@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA,
           const T beta,
           T* C,
           const int ldc);
+#endif

 template <class T>
 int getrf(const CBLAS_ORDER Order,
@@ -84,10 +93,21 @@ int getri(
     const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);

 template <class T>
-void axpy(const int n, const T alpha, const T* x, T* y);
+void axpy(const int n, const T alpha, const T* x, T* y) {
+  /// y = y + alpha * x
+  for (int i = 0; i < n; i++) {
+    y[i] = y[i] + alpha * x[i];
+  }
+}

 template <class T>
-T dotProduct(const int n, const T* x, const T* y);
+T dotProduct(const int n, const T* x, const T* y) {
+  T result = static_cast<T>(0);
+  for (int i = 0; i < n; i++) {
+    result += x[i] * y[i];
+  }
+  return result;
+}

 template <class T>
 void vExp(const int n, const T* a, T* r);
......
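
When PADDLE_USE_EIGEN_FOR_BLAS is defined, cblas.h is not included, so axpy and dotProduct fall back to the plain-loop template definitions above. A standalone sanity check of what those fallbacks compute (a sketch, using float in place of real):

    #include <cassert>
    #include <cstdio>

    template <class T>
    void axpy(const int n, const T alpha, const T* x, T* y) {
      for (int i = 0; i < n; i++) y[i] = y[i] + alpha * x[i];  // y += alpha * x
    }

    template <class T>
    T dotProduct(const int n, const T* x, const T* y) {
      T result = static_cast<T>(0);
      for (int i = 0; i < n; i++) result += x[i] * y[i];
      return result;
    }

    int main() {
      float x[3] = {1.f, 2.f, 3.f};
      float y[3] = {4.f, 5.f, 6.f};
      axpy(3, 2.f, x, y);                   // y = {6, 9, 12}
      assert(dotProduct(3, x, y) == 60.f);  // 1*6 + 2*9 + 3*12
      printf("ok\n");
      return 0;
    }
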
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "hl_top_k.h"
 #include "paddle/utils/Logging.h"

+#include "paddle/function/GemmFunctor.h"
 #include "paddle/utils/ThreadLocal.h"

 #include "SIMDFunctions.h"
@@ -2222,24 +2223,29 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
   CHECK(!isTransposed()) << "Not supported";

   size_t a_col, b_col, a_row, b_row;
-  CBLAS_TRANSPOSE a_trans, b_trans;
+  // CBLAS_TRANSPOSE a_trans, b_trans;
+  bool a_trans, b_trans;
   if (!a->isTransposed()) {
     a_col = a->getWidth();
     a_row = a->getHeight();
-    a_trans = CblasNoTrans;
+    // a_trans = CblasNoTrans;
+    a_trans = false;
   } else {
     a_col = a->getHeight();
     a_row = a->getWidth();
-    a_trans = CblasTrans;
+    // a_trans = CblasTrans;
+    a_trans = true;
   }
   if (!b->isTransposed()) {
     b_col = b->getWidth();
     b_row = b->getHeight();
-    b_trans = CblasNoTrans;
+    // b_trans = CblasNoTrans;
+    b_trans = false;
   } else {
     b_col = b->getHeight();
     b_row = b->getWidth();
-    b_trans = CblasTrans;
+    // b_trans = CblasTrans;
+    b_trans = true;
   }

   CHECK_EQ(a_col, b_row);
@@ -2256,7 +2262,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
   int lda = a->getStride();
   int ldb = b->getStride();
   int ldc = getStride();
-  gemm<real>(
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
       a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
 }
......
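
Under USE_EIGEN_FOR_BLAS the BlasGemm specialization presumably routes to Eigen instead of cblas; that implementation is outside this diff. One way such an Eigen-backed gemm could look, with row-major maps and explicit leading dimensions matching the call sites above (a sketch under those assumptions, not the committed code):

    #include <Eigen/Core>

    // Hypothetical Eigen-backed gemm: C = alpha * op(A) * op(B) + beta * C,
    // row-major with explicit leading dimensions, as the BlasGemm callers expect.
    template <typename T>
    void eigenGemmSketch(bool transA, bool transB, int M, int N, int K,
                         T alpha, const T* A, int lda, const T* B, int ldb,
                         T beta, T* C, int ldc) {
      using Mat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
      using CMap = Eigen::Map<const Mat, 0, Eigen::OuterStride<>>;
      Eigen::Map<Mat, 0, Eigen::OuterStride<>> c(C, M, N, Eigen::OuterStride<>(ldc));
      Mat a = transA ? Mat(CMap(A, K, M, Eigen::OuterStride<>(lda)).transpose())
                     : Mat(CMap(A, M, K, Eigen::OuterStride<>(lda)));
      Mat b = transB ? Mat(CMap(B, N, K, Eigen::OuterStride<>(ldb)).transpose())
                     : Mat(CMap(B, K, N, Eigen::OuterStride<>(ldb)));
      c = alpha * (a * b) + beta * c;  // product evaluates into a temporary first
    }
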
@@ -2,9 +2,9 @@
 set -xe

-mkdir -p /paddle/build_android/$ANDROID_ABI
-cd /paddle/build_android/$ANDROID_ABI
-rm -rf /paddle/install 2>/dev/null || true
+rm -rf /paddle/build_android 2>/dev/null || true
+mkdir -p /paddle/build_android
+cd /paddle/build_android

 THIRD_PARTY_PATH=/paddle/third_party_android/$ANDROID_ABI
@@ -14,19 +14,25 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then
         -DANDROID_ABI=$ANDROID_ABI \
         -DANDROID_ARM_NEON=ON \
         -DANDROID_ARM_MODE=ON \
+        -DCMAKE_C_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang \
+        -DCMAKE_CXX_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang++ \
         -DHOST_C_COMPILER=/usr/bin/gcc \
         -DHOST_CXX_COMPILER=/usr/bin/g++ \
         -DCMAKE_INSTALL_PREFIX=/paddle/install \
         -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \
         -DCMAKE_BUILD_TYPE=Release \
+        -DUSE_EIGEN_FOR_BLAS=ON \
         -DWITH_C_API=ON \
         -DWITH_SWIG_PY=OFF \
-        /paddle
-elif [ $ANDROID_ABI == "arm64-v7a" ]; then
+        -DWITH_STYLE_CHECK=OFF \
+        ..
+elif [ $ANDROID_ABI == "arm64-v8a" ]; then
   cmake -DCMAKE_SYSTEM_NAME=Android \
         -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \
         -DANDROID_ABI=$ANDROID_ABI \
         -DANDROID_ARM_MODE=ON \
+        -DCMAKE_C_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang \
+        -DCMAKE_CXX_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang++ \
         -DHOST_C_COMPILER=/usr/bin/gcc \
         -DHOST_CXX_COMPILER=/usr/bin/g++ \
         -DCMAKE_INSTALL_PREFIX=/paddle/install \
@@ -34,7 +40,7 @@ elif [ $ANDROID_ABI == "arm64-v7a" ]; then
         -DCMAKE_BUILD_TYPE=Release \
         -DWITH_C_API=ON \
         -DWITH_SWIG_PY=OFF \
-        /paddle
+        ..
 elif [ $ANDROID_ABI == "armeabi" ]; then
   cmake -DCMAKE_SYSTEM_NAME=Android \
         -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \
@@ -47,10 +53,10 @@ elif [ $ANDROID_ABI == "armeabi" ]; then
         -DCMAKE_BUILD_TYPE=Release \
         -DWITH_C_API=ON \
         -DWITH_SWIG_PY=OFF \
-        /paddle
+        ..
 else
   echo "Invalid ANDROID_ABI: $ANDROID_ABI"
 fi

-make -j `nproc`
-make install -j `nproc`
+make VERBOSE=1
+make install