Commit a916c525 authored by: T tensor-tang

refine gemm

Parent 961e754c
@@ -37,6 +37,7 @@ struct CBlas<float> {
     libxsmm_sgemm(args...);
   }
 #endif
+
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
     platform::dynload::cblas_saxpy(args...);
@@ -76,6 +77,7 @@ struct CBlas<double> {
     libxsmm_dgemm(args...);
   }
 #endif
+
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
     platform::dynload::cblas_daxpy(args...);
@@ -150,6 +152,7 @@ struct CBlas<double> {
   }
 };
 #endif
+
 template <>
 struct CBlas<platform::float16> {
   static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
@@ -190,45 +193,48 @@ inline bool UseXSMM<platform::float16>(const int &m, const int &n, const int &k,
   return false;
 }
 
-template <>
 template <typename T>
-void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                            CBLAS_TRANSPOSE transB, int M,
-                                            int N, int K, T alpha, const T *A,
-                                            const T *B, T beta, T *C) const {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
+inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
+                      CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
+                      const T *A, int lda, const T *B, int ldb, T beta, T *C,
+                      int ldc) {
 #ifdef PADDLE_WITH_LIBXSMM
-  if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
-              beta)) {
+  if (UseXSMM<T>(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
+                 beta)) {
     // Note: SMM use ColMajor
     const char transa = 'N';
     const char transb = 'N';
     CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda,
                        &beta, C, &ldc);
-  } else {
+    return;
+  }
 #endif
 #ifdef PADDLE_MKL_SPLIT_GEMM
-    constexpr int bs = 2;
-    if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) {
-      for (int off = 0; off < M; off += bs) {
-        CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, off, N, K,
-                       alpha, A + off * lda, lda, B, ldb, beta, C + off * ldb,
-                       ldc);
-      }
-    } else {
-#endif
-      CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B,
-                     ldb, beta, C, ldc);
-#ifdef PADDLE_MKL_SPLIT_GEMM
-    }
-#endif
-#ifdef PADDLE_WITH_LIBXSMM
-  }
-#endif
+  constexpr int bs = 2;
+  if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) {
+    for (int off = 0; off < M; off += bs) {
+      CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha,
+                     A + off * lda, lda, B, ldb, beta, C + off * ldb, ldc);
+    }
+    return;
+  }
+#endif
+  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+                 beta, C, ldc);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
+                                            CBLAS_TRANSPOSE transB, int M,
+                                            int N, int K, T alpha, const T *A,
+                                            const T *B, T beta, T *C) const {
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  GEMM_WARP<T>(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+               beta, C, ldc);
 }
 
 template <>
@@ -237,9 +243,9 @@ void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
                                             int N, int K, T alpha, const T *A,
                                             int lda, const T *B, int ldb,
                                             T beta, T *C, int ldc) const {
-  CBlas<T>::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-                 transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-                 lda, B, ldb, beta, C, ldc);
+  GEMM_WARP<T>(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+               transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+               lda, B, ldb, beta, C, ldc);
 }
 
 template <typename DeviceContext>
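For context, the change above folds the nested #ifdef/else chain into a single dispatch helper, GEMM_WARP, in which every specialized path returns early and the plain CBlas<T>::GEMM call becomes the unconditional fallback at the bottom. Below is a minimal, self-contained sketch of that dispatch shape; it is not the Paddle code. The backends are replaced by a naive reference kernel, and the names gemm_warp_sketch / reference_gemm as well as the M * N * K < 64 small-matrix threshold are invented for illustration.

#include <cstdio>
#include <vector>

// Hypothetical stand-in for a row-major reference GEMM backend:
// C = alpha * A * B + beta * C, with A (M x K), B (K x N), C (M x N).
void reference_gemm(int M, int N, int K, float alpha, const float *A, int lda,
                    const float *B, int ldb, float beta, float *C, int ldc) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float sum = 0.f;
      for (int k = 0; k < K; ++k) sum += A[i * lda + k] * B[k * ldb + j];
      C[i * ldc + j] = alpha * sum + beta * C[i * ldc + j];
    }
  }
}

// Dispatch wrapper mirroring the shape of GEMM_WARP: each specialized path
// returns early; the generic call at the bottom is the fallback.
void gemm_warp_sketch(int M, int N, int K, float alpha, const float *A, int lda,
                      const float *B, int ldb, float beta, float *C, int ldc) {
  // Small-matrix path (plays the role of the libxsmm SMM_GEMM branch;
  // the 64-element threshold is made up for this sketch).
  if (M * N * K < 64) {
    reference_gemm(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
    return;
  }
  // Row-blocked path (plays the role of the PADDLE_MKL_SPLIT_GEMM branch):
  // split the M dimension into blocks of bs rows, one GEMM per block.
  constexpr int bs = 2;
  if (M % bs == 0) {
    for (int off = 0; off < M; off += bs) {
      reference_gemm(bs, N, K, alpha, A + off * lda, lda, B, ldb, beta,
                     C + off * ldc, ldc);
    }
    return;
  }
  // Unconditional fallback (plays the role of the plain CBlas<T>::GEMM call).
  reference_gemm(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}

int main() {
  const int M = 4, N = 3, K = 5;  // 60 elements: takes the small-matrix path
  std::vector<float> A(M * K, 1.f), B(K * N, 2.f), C(M * N, 0.f);
  gemm_warp_sketch(M, N, K, 1.f, A.data(), K, B.data(), N, 0.f, C.data(), N);
  std::printf("C[0] = %.1f (expected %.1f)\n", C[0], 2.f * K);
  return 0;
}

The early-return structure keeps each compile-time path (libxsmm, split MKL GEMM, plain CBLAS) independent, so enabling or disabling one backend no longer changes the brace and #endif nesting of the others, and both row-major Blas<CPUDeviceContext>::GEMM overloads can share the same helper.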