提交 8a4fad42 编写于 作者: L Liu Yiqun

Support to use clang for Android cross-compiling.

上级 5ca41184
......@@ -13,6 +13,10 @@
# system paths.
#
if(USE_EIGEN_FOR_BLAS)
return()
endif(USE_EIGEN_FOR_BLAS)
set(CBLAS_FOUND OFF)
## Find MKLML First.
......
......@@ -41,6 +41,7 @@ IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "App
ELSE()
SET(USE_OMP ON)
ENDIF()
# Unconditionally disable OpenMP for the warp-ctc external build; this
# overrides whatever the compiler-based IF/ELSE just above selected.
# FORCE is only meaningful in the CACHE form of set(); on a normal variable
# it would be stored as a second list element ("OFF;FORCE"), so it is omitted.
SET(USE_OMP OFF)
ExternalProject_Add(
extern_warpctc
......
......@@ -20,11 +20,11 @@ limitations under the License. */
#include "paddle/math/MathFunctions.h"
#ifndef PADDLE_TYPE_DOUBLE
#define CBLAS_GEMM paddle::gemm<float>
#else
#define CBLAS_GEMM paddle::gemm<double>
#endif
// #ifndef PADDLE_TYPE_DOUBLE
// #define CBLAS_GEMM paddle::gemm<float>
// #else
// #define CBLAS_GEMM paddle::gemm<double>
// #endif
template<class OpResetOutput>
void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
......@@ -219,37 +219,37 @@ void hl_cpu_gru_forward(OpResetOutput opResetOutput,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
if (value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasNoTrans,
batchSize,
2 * frameSize,
frameSize,
1,
value.prevOutValue,
frameSize,
value.gateWeight,
frameSize * 2,
1,
value.gateValue,
frameSize * 3);
// CBLAS_GEMM(CblasNoTrans,
// CblasNoTrans,
// batchSize,
// 2 * frameSize,
// frameSize,
// 1,
// value.prevOutValue,
// frameSize,
// value.gateWeight,
// frameSize * 2,
// 1,
// value.gateValue,
// frameSize * 3);
}
forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
if (value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasNoTrans,
batchSize,
frameSize,
frameSize,
1,
value.resetOutputValue,
frameSize,
value.stateWeight,
frameSize,
1,
value.gateValue + frameSize * 2,
frameSize * 3);
// CBLAS_GEMM(CblasNoTrans,
// CblasNoTrans,
// batchSize,
// frameSize,
// frameSize,
// 1,
// value.resetOutputValue,
// frameSize,
// value.stateWeight,
// frameSize,
// 1,
// value.gateValue + frameSize * 2,
// frameSize * 3);
}
forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
......@@ -538,34 +538,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad,
frameSize, batchSize, active_node);
if (value.prevOutValue && grad.prevOutGrad) {
CBLAS_GEMM(CblasNoTrans,
CblasTrans,
batchSize,
frameSize,
frameSize,
1,
grad.gateGrad + frameSize * 2,
frameSize * 3,
value.stateWeight,
frameSize,
0,
grad.resetOutputGrad,
frameSize);
// CBLAS_GEMM(CblasNoTrans,
// CblasTrans,
// batchSize,
// frameSize,
// frameSize,
// 1,
// grad.gateGrad + frameSize * 2,
// frameSize * 3,
// value.stateWeight,
// frameSize,
// 0,
// grad.resetOutputGrad,
// frameSize);
if (grad.stateWeightGrad) {
CBLAS_GEMM(CblasTrans,
CblasNoTrans,
frameSize,
frameSize,
batchSize,
1,
value.resetOutputValue,
frameSize,
grad.gateGrad + frameSize * 2,
frameSize * 3,
1,
grad.stateWeightGrad,
frameSize);
// CBLAS_GEMM(CblasTrans,
// CblasNoTrans,
// frameSize,
// frameSize,
// batchSize,
// 1,
// value.resetOutputValue,
// frameSize,
// grad.gateGrad + frameSize * 2,
// frameSize * 3,
// 1,
// grad.stateWeightGrad,
// frameSize);
}
}
......@@ -573,34 +573,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad,
frameSize, batchSize, active_gate);
if (grad.prevOutGrad && value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasTrans,
batchSize,
frameSize,
frameSize * 2,
1,
grad.gateGrad,
frameSize * 3,
value.gateWeight,
frameSize * 2,
1,
grad.prevOutGrad,
frameSize);
// CBLAS_GEMM(CblasNoTrans,
// CblasTrans,
// batchSize,
// frameSize,
// frameSize * 2,
// 1,
// grad.gateGrad,
// frameSize * 3,
// value.gateWeight,
// frameSize * 2,
// 1,
// grad.prevOutGrad,
// frameSize);
if (grad.gateWeightGrad) {
CBLAS_GEMM(CblasTrans,
CblasNoTrans,
frameSize,
frameSize * 2,
batchSize,
1,
value.prevOutValue,
frameSize,
grad.gateGrad,
frameSize * 3,
1,
grad.gateWeightGrad,
frameSize * 2);
// CBLAS_GEMM(CblasTrans,
// CblasNoTrans,
// frameSize,
// frameSize * 2,
// batchSize,
// 1,
// value.prevOutValue,
// frameSize,
// grad.gateGrad,
// frameSize * 3,
// 1,
// grad.gateWeightGrad,
// frameSize * 2);
}
}
}
......
......@@ -13,18 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "MulOp.h"
/// todo(tianbing), delete it
#include <iostream>
#include "paddle/math/MathFunctions.h"
#include "GemmFunctor.h"
#include "paddle/math/SIMDFunctions.h"
#include "paddle/utils/ThreadLocal.h"
#ifndef PADDLE_TYPE_DOUBLE
#define GEMM paddle::gemm<float>
#else
#define GEMM paddle::gemm<double>
#endif
namespace {
inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
for (unsigned int i = 0; i < len; ++i) {
......@@ -114,19 +106,20 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
real scaleT,
bool aTrans,
bool bTrans) {
GEMM(aTrans ? CblasTrans : CblasNoTrans,
bTrans ? CblasTrans : CblasNoTrans,
out.getHeight(),
out.getWidth(),
!aTrans ? a.getWidth() : a.getHeight(),
scaleAB,
a.getData(),
a.getStride(),
b.getData(),
b.getStride(),
scaleT,
out.getData(),
out.getStride());
BlasGemm<DEVICE_TYPE_CPU, real>::compute(
aTrans,
bTrans,
out.getHeight(),
out.getWidth(),
!aTrans ? a.getWidth() : a.getHeight(),
scaleAB,
a.getData(),
a.getStride(),
b.getData(),
b.getStride(),
scaleT,
out.getData(),
out.getStride());
}
/// dense matrix (+)= sparse matrix * dense matrix
......
......@@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
namespace paddle {
#ifndef PADDLE_USE_EIGEN_FOR_BLAS
template <>
void gemm<float>(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB,
......@@ -143,6 +144,7 @@ void gemm<double>(const CBLAS_TRANSPOSE transA,
C,
ldc);
}
#endif
template <>
int getrf<float>(const CBLAS_ORDER order,
......@@ -182,6 +184,7 @@ int getri<double>(const CBLAS_ORDER order,
return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
}
#ifndef PADDLE_USE_EIGEN_FOR_BLAS
template <>
void axpy<float>(const int n, const float alpha, const float* x, float* y) {
cblas_saxpy(n, alpha, x, 1, y, 1);
......@@ -201,6 +204,7 @@ template <>
double dotProduct<double>(const int n, const double* x, const double* y) {
return cblas_ddot(n, x, 1, y, 1);
}
#endif
#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
......
......@@ -40,7 +40,14 @@ extern "C" {
#ifndef LAPACK_FOUND
extern "C" {
#ifndef PADDLE_USE_EIGEN_FOR_BLAS
#include <cblas.h>
#else
/* Local definition of the matrix storage-order enum, provided in place of
 * including <cblas.h> when PADDLE_USE_EIGEN_FOR_BLAS is defined. The
 * getrf/getri wrappers declared in this header take a CBLAS_ORDER argument. */
enum CBLAS_ORDER {
  CblasRowMajor = 101, /* row-major storage */
  CblasColMajor = 102  /* column-major storage */
};
typedef enum CBLAS_ORDER CBLAS_ORDER;
#endif
int LAPACKE_sgetrf(
int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
int LAPACKE_dgetrf(
......@@ -56,6 +63,7 @@ int LAPACKE_dgetri(
namespace paddle {
#ifndef PADDLE_USE_EIGEN_FOR_BLAS
template <class T>
void gemm(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB,
......@@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA,
const T beta,
T* C,
const int ldc);
#endif
template <class T>
int getrf(const CBLAS_ORDER Order,
......@@ -84,10 +93,20 @@ int getri(
const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);
template <class T>
void axpy(const int n, const T alpha, const T* x, T* y);

/**
 * Generic fallback for axpy: y = y + alpha * x, element by element.
 *
 * @param n      number of elements read from x and updated in y.
 * @param alpha  scalar multiplier applied to each x[i].
 * @param x      input array of at least n elements.
 * @param y      array of at least n elements, updated in place.
 *
 * The definition carries its own template header; without it `T` would be
 * an undeclared name, because the header above belongs to the declaration.
 */
template <class T>
void axpy(const int n, const T alpha, const T* x, T* y) {
  /// y = y + alpha * x
  for (int i = 0; i < n; i++) {
    y[i] = y[i] + alpha * x[i];
  }
}
template <class T>
T dotProduct(const int n, const T* x, const T* y);

/**
 * Generic fallback for the dot product of two arrays.
 *
 * @param n  number of elements read from each array.
 * @param x  first input array of at least n elements.
 * @param y  second input array of at least n elements.
 * @return   sum over i in [0, n) of x[i] * y[i]; zero when n <= 0.
 *
 * The accumulated value must be returned explicitly: flowing off the end of
 * a non-void function is undefined behaviour, and callers would otherwise
 * read an indeterminate result.
 */
template <class T>
T dotProduct(const int n, const T* x, const T* y) {
  T result = static_cast<T>(0);
  for (int i = 0; i < n; i++) {
    result += x[i] * y[i];
  }
  return result;
}
template <class T>
void vExp(const int n, const T* a, T* r);
......
......@@ -28,6 +28,7 @@ limitations under the License. */
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"
#include "paddle/function/GemmFunctor.h"
#include "paddle/utils/ThreadLocal.h"
#include "SIMDFunctions.h"
......@@ -2222,24 +2223,29 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
CHECK(!isTransposed()) << "Not supported";
size_t a_col, b_col, a_row, b_row;
CBLAS_TRANSPOSE a_trans, b_trans;
// CBLAS_TRANSPOSE a_trans, b_trans;
bool a_trans, b_trans;
if (!a->isTransposed()) {
a_col = a->getWidth();
a_row = a->getHeight();
a_trans = CblasNoTrans;
// a_trans = CblasNoTrans;
a_trans = false;
} else {
a_col = a->getHeight();
a_row = a->getWidth();
a_trans = CblasTrans;
// a_trans = CblasTrans;
a_trans = true;
}
if (!b->isTransposed()) {
b_col = b->getWidth();
b_row = b->getHeight();
b_trans = CblasNoTrans;
// b_trans = CblasNoTrans;
b_trans = false;
} else {
b_col = b->getHeight();
b_row = b->getWidth();
b_trans = CblasTrans;
// b_trans = CblasTrans;
b_trans = true;
}
CHECK_EQ(a_col, b_row);
......@@ -2256,7 +2262,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
int lda = a->getStride();
int ldb = b->getStride();
int ldc = getStride();
gemm<real>(
BlasGemm<DEVICE_TYPE_CPU, real>::compute(
a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
}
......
......@@ -2,9 +2,9 @@
set -xe
mkdir -p /paddle/build_android/$ANDROID_ABI
cd /paddle/build_android/$ANDROID_ABI
rm -rf /paddle/install 2>/dev/null || true
rm -rf /paddle/build_android 2>/dev/null || true
mkdir -p /paddle/build_android
cd /paddle/build_android
THIRD_PARTY_PATH=/paddle/third_party_android/$ANDROID_ABI
......@@ -14,19 +14,25 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_NEON=ON \
-DANDROID_ARM_MODE=ON \
-DCMAKE_C_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang \
-DCMAKE_CXX_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang++ \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=/paddle/install \
-DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \
-DCMAKE_BUILD_TYPE=Release \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
/paddle
elif [ $ANDROID_ABI == "arm64-v7a" ]; then
-DWITH_STYLE_CHECK=OFF \
..
elif [ $ANDROID_ABI == "arm64-v8a" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_MODE=ON \
-DCMAKE_C_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang \
-DCMAKE_CXX_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang++ \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=/paddle/install \
......@@ -34,7 +40,7 @@ elif [ $ANDROID_ABI == "arm64-v7a" ]; then
-DCMAKE_BUILD_TYPE=Release \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
/paddle
..
elif [ $ANDROID_ABI == "armeabi" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \
......@@ -47,10 +53,10 @@ elif [ $ANDROID_ABI == "armeabi" ]; then
-DCMAKE_BUILD_TYPE=Release \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
/paddle
..
else
echo "Invalid ANDROID_ABI: $ANDROID_ABI"
fi
make -j `nproc`
make install -j `nproc`
make VERBOSE=1
make install
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册