diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ab80987b3ad6c4793ceeac1bf3808d2e87fbd5b..231224f9249848b6e4981a98e0538794bf5d3c08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,6 +136,12 @@ else() set(THIRD_PARTY_BUILD_TYPE Release) endif() +if(WITH_MKL) + option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF) + if (MKL_SPLIT_GEMM) + add_definitions(-DPADDLE_MKL_SPLIT_GEMM) + endif() +endif() set(WITH_MKLML ${WITH_MKL}) if (NOT DEFINED WITH_MKLDNN) if (WITH_MKL AND AVX2_FOUND) diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 238bd3f8def9eaa6c18afdab1031c4babfde8ae2..a0802ef90ca7e30a2b22d187cb9092163518d8e9 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -37,6 +37,7 @@ struct CBlas { libxsmm_sgemm(args...); } #endif + template static void AXPY(ARGS... args) { platform::dynload::cblas_saxpy(args...); @@ -76,6 +77,7 @@ struct CBlas { libxsmm_dgemm(args...); } #endif + template static void AXPY(ARGS... args) { platform::dynload::cblas_daxpy(args...); @@ -150,6 +152,7 @@ struct CBlas { } }; #endif + template <> struct CBlas { static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } @@ -190,30 +193,48 @@ inline bool UseXSMM(const int &m, const int &n, const int &k, return false; } -template <> template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, int M, - int N, int K, T alpha, const T *A, - const T *B, T beta, T *C) const { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; +inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, + int ldc) { #ifdef PADDLE_WITH_LIBXSMM - if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha, - beta)) { + if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha, + beta)) { // Note: SMM use ColMajor const char transa = 'N'; const char transb = 'N'; CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc); - } else { + return; + } #endif - CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, - ldb, beta, C, ldc); -#ifdef PADDLE_WITH_LIBXSMM + +#ifdef PADDLE_MKL_SPLIT_GEMM + constexpr int bs = 2; + if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) { + for (int off = 0; off < M; off += bs) { + CBlas::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha, + A + off * lda, lda, B, ldb, beta, C + off * ldb, ldc); + } + return; } #endif + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, + int N, int K, T alpha, const T *A, + const T *B, T beta, T *C) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + GEMM_WARP(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); } template <> @@ -222,9 +243,9 @@ void Blas::GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T *A, int lda, const T *B, int ldb, T beta, T *C, int ldc) const { - CBlas::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, - lda, B, ldb, beta, C, ldc); + GEMM_WARP(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); } template diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 078dd448c385dbb8a00025ee2ba08d0c41a4730a..2343e0ee965303c9fdb2ad3faf9ddf6e5bb7782f 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -228,3 +228,57 @@ TEST(math_funciton, set_constant) { } delete ctx; } + +template +void GemmWarpTest(int m, int n, int k, T alpha, T beta) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor mat_b; + paddle::framework::Tensor mat_c_ref; + paddle::framework::Tensor mat_c_mkl; + auto* cpu_place = new paddle::platform::CPUPlace(); + + T* A = mat_a.mutable_data({m, k}, *cpu_place); + T* B = mat_b.mutable_data({k, n}, *cpu_place); + T* CREF = mat_c_ref.mutable_data({m, n}, *cpu_place); + T* CMKL = mat_c_mkl.mutable_data({m, n}, *cpu_place); + + ASSERT_EQ(mat_c_mkl.numel(), mat_c_ref.numel()); + for (int i = 0; i < mat_a.numel(); ++i) { + A[i] = static_cast(i); + } + for (int i = 0; i < mat_b.numel(); ++i) { + B[i] = static_cast(i + 1); + } + for (int i = 0; i < mat_c_ref.numel(); ++i) { + CREF[i] = static_cast(i + 2); + CMKL[i] = CREF[i]; + } + + // this would call gemm_warp + paddle::platform::CPUDeviceContext context(*cpu_place); + GetBlas(context).GEMM(CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B, + beta, CREF); + + // lda,ldb,ldc follow RowMajor + int lda = k; + int ldb = n; + int ldc = n; + paddle::operators::math::CBlas::GEMM(CblasRowMajor, CblasNoTrans, + CblasNoTrans, m, n, k, alpha, A, lda, + B, ldb, beta, CMKL, ldc); + + for (int i = 0; i < mat_c_mkl.numel(); ++i) { + EXPECT_FLOAT_EQ(CREF[i], CMKL[i]); + } +} + +TEST(math_function, gemm_warp) { + GemmWarpTest(3, 2, 5, 1.f, 0.f); + GemmWarpTest(3, 2, 5, 2.f, 1.f); + GemmWarpTest(8, 5, 6, 1.f, 0.f); + GemmWarpTest(8, 5, 6, 2.f, 1.f); + GemmWarpTest(3, 2, 5, 1.0, 0.0); + GemmWarpTest(3, 2, 5, 2.0, 1.0); + GemmWarpTest(8, 5, 6, 1.0, 0.0); + GemmWarpTest(8, 5, 6, 2.0, 1.0); +}