From 8f5b0311a21fd934c8d6fade55de511d9e3ddcc7 Mon Sep 17 00:00:00 2001 From: zhaojiaying01 Date: Thu, 24 May 2018 12:33:00 +0800 Subject: [PATCH] replace openplas with gemm --- CMakeLists.txt | 4 +- src/operators/math/{Gemm.cpp => gemm.cpp} | 77 +++++++++++++++++++---- src/operators/math/{Gemm.h => gemm.h} | 20 ++++-- src/operators/math/math_function.cpp | 57 +---------------- src/operators/math/math_function.h | 11 ---- test/common/test_gemm.cpp.cpp | 29 ++++----- 6 files changed, 99 insertions(+), 99 deletions(-) rename src/operators/math/{Gemm.cpp => gemm.cpp} (71%) rename src/operators/math/{Gemm.h => gemm.h} (76%) diff --git a/CMakeLists.txt b/CMakeLists.txt index c61e60d9e6..84ca93ac6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,8 +34,6 @@ if (ANDROID) link_directories(third-party/protobuf/armeabi-v7a) else() # link openblas - include_directories(third-party/openblas/include) - link_directories(third-party/openblas/lib) link_directories(third-party/protobuf/lib) endif () @@ -47,7 +45,7 @@ if (ANDROID) # openblas.a need log lib target_link_libraries(paddle-mobile protobuf-lite) else() - target_link_libraries(paddle-mobile protobuf-lite openblas) + target_link_libraries(paddle-mobile protobuf-lite) endif () #add_dependencies(paddle-mobile openblas_proj) diff --git a/src/operators/math/Gemm.cpp b/src/operators/math/gemm.cpp similarity index 71% rename from src/operators/math/Gemm.cpp rename to src/operators/math/gemm.cpp index 40fc7b105a..bc484c2f0d 100644 --- a/src/operators/math/Gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "operators/math/Gemm.h" -#include +#include "operators/math/gemm.h" namespace paddle_mobile { namespace operators { namespace math { -// 将A矩阵分块复制到连续内存 +// 将A矩阵分块复制到连续内存(ColMajor) void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, float *buffer) { int i, j; @@ -45,15 +44,45 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, } } -// 将B矩阵分块复制到连续内存 +// 将A矩阵分块复制到连续内存(RowMajor) +void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda, + float *buffer) { + int i, j; + const float *Ai, *Ai1, *Ai2, *Ai3; + for (i = 0; i < m - paddingM; i += MR) { + Ai = &A(i, 0); + Ai1 = &A(i + 1, 0); + Ai2 = &A(i + 2, 0); + Ai3 = &A(i + 3, 0); + for (int j = 0; j < k; ++j) { + *buffer++ = *Ai++; + *buffer++ = *Ai1++; + *buffer++ = *Ai2++; + *buffer++ = *Ai3++; + } + } + if (paddingM != 0) { + for (j = 0; j < k; ++j) { + for (i = m - paddingM; i < m; ++i) { + *buffer++ = A(i, j); + } + for (i = m; i < m + (MR - paddingM); ++i) { + *buffer++ = 0; + } + } + } +} + +// 将B矩阵分块复制到连续内存(ColMajor) void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, float *buffer) { int i, j; + const float *Bj, *Bj1, *Bj2, *Bj3; for (j = 0; j < n - paddingN; j += NR) { - const float *Bj = &B(0, j); - const float *Bj1 = &B(0, j + 1); - const float *Bj2 = &B(0, j + 2); - const float *Bj3 = &B(0, j + 3); + Bj = &B(0, j); + Bj1 = &B(0, j + 1); + Bj2 = &B(0, j + 2); + Bj3 = &B(0, j + 3); for (i = 0; i < k; ++i) { *buffer++ = *Bj++; *buffer++ = *Bj1++; @@ -64,7 +93,33 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, if (paddingN != 0) { for (i = 0; i < k; ++i) { for (int j = n - paddingN; j < n; ++j) { - const float *Bij = &B(i, j); + *buffer++ = B(i, j); + } + for (int j = n; j < n + (NR - paddingN); ++j) { + *buffer++ = 0; + } + } + } +} + +// 将B矩阵分块复制到连续内存(RowMajor) +void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb, + float *buffer) { + int i, j; + const float *Bij; + for (j = 0; j < n - paddingN; j += NR) { + for (i = 0; i < k; ++i) { + Bij = &B(i, j); + *buffer++ = *Bij; + *buffer++ = *(Bij + 1); + *buffer++ = *(Bij + 2); + *buffer++ = *(Bij + 3); + } + } + if (paddingN != 0) { + for (i = 0; i < k; ++i) { + Bij = &B(i, n - paddingN); + for (int j = n - paddingN; j < n; ++j) { *buffer++ = *Bij++; } for (int j = n; j < n + (NR - paddingN); ++j) { @@ -95,9 +150,9 @@ void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B, static float packedB[KC * NC]; if (first_time) { - PackMatrixB(k, n, _nc, B, ldb, packedB); + PackMatrixB_(k, n, _nc, B, ldb, packedB); } - PackMatrixA(m, k, _mc, A, lda, packedA); + PackMatrixA_(m, k, _mc, A, lda, packedA); int i, j, mc, nc; diff --git a/src/operators/math/Gemm.h b/src/operators/math/gemm.h similarity index 76% rename from src/operators/math/Gemm.h rename to src/operators/math/gemm.h index 7c5f50f0de..2eea23a3b1 100644 --- a/src/operators/math/Gemm.h +++ b/src/operators/math/gemm.h @@ -14,10 +14,10 @@ limitations under the License. */ #pragma once -// 矩阵取值运算宏,假设矩阵按列存储 -#define A(i, j) A[(j)*lda + (i)] -#define B(i, j) B[(j)*ldb + (i)] -#define C(i, j) C[(j)*ldc + (i)] +// 矩阵取值运算宏,假设矩阵按行存储 +#define A(i, j) A[(i)*lda + (j)] +#define B(i, j) B[(i)*ldb + (j)] +#define C(i, j) C[(i)*ldc + (j)] // 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k #define MC 384 @@ -32,14 +32,22 @@ namespace paddle_mobile { namespace operators { namespace math { -// 将 A 矩阵分块复制到连续内存 +// 将 A 矩阵分块复制到连续内存(ColMajor) void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, float *buffer); -// 将 B 矩阵分块复制到连续内存 +// 将 B 矩阵分块复制到连续内存(ColMajor) void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, float *buffer); +// 将 A 矩阵分块复制到连续内存(RowMajor) +void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda, + float *buffer); + +// 将 B 矩阵分块复制到连续内存(RowMajor) +void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb, + float *buffer); + // 分块矩阵乘法 void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B, int ldb, float *C, int ldc, int first_time); diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp index b1487bb0a8..b47d408a6f 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -13,54 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "operators/math/math_function.h" +#include "operators/math/gemm.h" namespace paddle_mobile { namespace operators { namespace math { -template <> -void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, const float alpha, - const float *A, const float *B, const float beta, float *C) { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); -} - -template <> -void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, const double alpha, - const double *A, const double *B, const double beta, - double *C) { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); -} - -template <> -void gemm(const bool transA, const bool transB, const int M, const int N, - const int K, const float alpha, const float *A, const int lda, - const float *B, const int ldb, const float beta, float *C, - const int ldc) { - cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, - lda, B, ldb, beta, C, ldc); -} - -template <> -void gemm(const bool transA, const bool transB, const int M, - const int N, const int K, const double alpha, const double *A, - const int lda, const double *B, const int ldb, - const double beta, double *C, const int ldc) { - cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, - lda, B, ldb, beta, C, ldc); -} - template <> void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, float alpha, @@ -83,11 +41,8 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int N = dim_out[1]; int K = (trans_a == false) ? dim_a[1] : dim_a[0]; - CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - - gemm(transA, transB, M, N, K, alpha, matrix_a.data(), - matrix_b.data(), beta, matrix_out->data()); + sgemm(M, N, K, 1, matrix_a.data(), K, matrix_b.data(), N, 0, + matrix_out->data(), N); } template <> @@ -111,12 +66,6 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int M = dim_out[0]; int N = dim_out[1]; int K = (trans_a == false) ? dim_a[1] : dim_a[0]; - - CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - - gemm(transA, transB, M, N, K, alpha, matrix_a.data(), - matrix_b.data(), beta, matrix_out->data()); } } // namespace math diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h index 44158dfee2..bf81fc88a0 100644 --- a/src/operators/math/math_function.h +++ b/src/operators/math/math_function.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include #include #include "framework/tensor.h" @@ -22,16 +21,6 @@ namespace paddle_mobile { namespace operators { namespace math { -template -void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, const T alpha, const T *A, - const T *B, const T beta, T *C); - -template -void gemm(const bool transA, const bool transB, const int M, const int N, - const int K, const T alpha, const T *A, const int lda, const T *B, - const int ldb, const T beta, T *C, const int ldc); - // matrix multiply with continuous memory template void matmul(const framework::Tensor &matrix_a, bool trans_a, diff --git a/test/common/test_gemm.cpp.cpp b/test/common/test_gemm.cpp.cpp index f189e53f41..0e32a87c72 100644 --- a/test/common/test_gemm.cpp.cpp +++ b/test/common/test_gemm.cpp.cpp @@ -14,24 +14,25 @@ limitations under the License. */ #include #include "common/log.h" -#include "operators/math/Gemm.h" +#include "operators/math/gemm.h" -#define a(i, j) a[(j)*lda + (i)] -#define b(i, j) b[(j)*ldb + (i)] -#define c1(i, j) c1[(j)*ldc + (i)] +#define a(i, j) a[(i)*lda + (j)] +#define b(i, j) b[(i)*ldb + (j)] +#define c1(i, j) c1[(i)*ldc + (j)] + +#define m 7 +#define n 7 +#define k 7 int main() { - int m = 45; - int n = 46; - int k = 125; - int lda = m; - int ldb = k; - int ldc = m; + int lda = k; + int ldb = n; + int ldc = n; - float a[45 * 125]; - float b[125 * 46]; - float c[45 * 46] = {0}; - float c1[45 * 46] = {0}; + float a[7 * 7]; + float b[7 * 7]; + float c[7 * 7] = {0}; + float c1[7 * 7] = {0}; for (int i = 0; i < m * k; ++i) { a[i] = 2; } -- GitLab