From 8f5b0311a21fd934c8d6fade55de511d9e3ddcc7 Mon Sep 17 00:00:00 2001
From: zhaojiaying01 <zhaojiaying01@baidu.com>
Date: Thu, 24 May 2018 12:33:00 +0800
Subject: [PATCH] replace openblas with gemm

---
 CMakeLists.txt                            |  4 +-
 src/operators/math/{Gemm.cpp => gemm.cpp} | 77 +++++++++++++++++++----
 src/operators/math/{Gemm.h => gemm.h}     | 20 ++++--
 src/operators/math/math_function.cpp      | 57 +----------------
 src/operators/math/math_function.h        | 11 ----
 test/common/test_gemm.cpp.cpp             | 29 ++++-----
 6 files changed, 99 insertions(+), 99 deletions(-)
 rename src/operators/math/{Gemm.cpp => gemm.cpp} (71%)
 rename src/operators/math/{Gemm.h => gemm.h} (76%)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c61e60d9e6..84ca93ac6a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,8 +34,6 @@ if (ANDROID)
     link_directories(third-party/protobuf/armeabi-v7a)
 else()
     # link openblas
-    include_directories(third-party/openblas/include)
-    link_directories(third-party/openblas/lib)
     link_directories(third-party/protobuf/lib)
 endif ()
 
@@ -47,7 +45,7 @@ if (ANDROID)
     # openblas.a need log lib
     target_link_libraries(paddle-mobile protobuf-lite)
 else()
-    target_link_libraries(paddle-mobile protobuf-lite openblas)
+    target_link_libraries(paddle-mobile protobuf-lite)
 endif ()
 #add_dependencies(paddle-mobile openblas_proj)
 
diff --git a/src/operators/math/Gemm.cpp b/src/operators/math/gemm.cpp
similarity index 71%
rename from src/operators/math/Gemm.cpp
rename to src/operators/math/gemm.cpp
index 40fc7b105a..bc484c2f0d 100644
--- a/src/operators/math/Gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "operators/math/Gemm.h"
-#include <iostream>
+#include "operators/math/gemm.h"
 
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-// 将A矩阵分块复制到连续内存
+// 将A矩阵分块复制到连续内存(ColMajor)
 void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
                  float *buffer) {
   int i, j;
@@ -45,15 +44,45 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
   }
 }
 
-// 将B矩阵分块复制到连续内存
+// Copy matrix A block by block into contiguous memory (RowMajor)
+void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
+                  float *buffer) {
+  int i, j;
+  const float *Ai, *Ai1, *Ai2, *Ai3;  // read cursors into four consecutive rows
+  for (i = 0; i < m - paddingM; i += MR) {  // body packs 4 rows per step; presumably MR == 4 (defined elsewhere) - confirm
+    Ai = &A(i, 0);  // first element of row i via the A(i, j) accessor macro
+    Ai1 = &A(i + 1, 0);
+    Ai2 = &A(i + 2, 0);
+    Ai3 = &A(i + 3, 0);
+    for (int j = 0; j < k; ++j) {  // NOTE: this `int j` shadows the outer `j`
+      *buffer++ = *Ai++;   // emit one element from each of the four rows,
+      *buffer++ = *Ai1++;  // so every group of four outputs holds rows
+      *buffer++ = *Ai2++;  // i..i+3 taken at the same column index
+      *buffer++ = *Ai3++;
+    }
+  }
+  if (paddingM != 0) {  // the last paddingM rows were not covered by the loop above
+    for (j = 0; j < k; ++j) {
+      for (i = m - paddingM; i < m; ++i) {
+        *buffer++ = A(i, j);  // copy the real leftover rows for column j
+      }
+      for (i = m; i < m + (MR - paddingM); ++i) {
+        *buffer++ = 0;  // zero-fill so each column still contributes MR entries
+      }
+    }
+  }
+}
+
+// 将B矩阵分块复制到连续内存(ColMajor)
 void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
                  float *buffer) {
   int i, j;
+  const float *Bj, *Bj1, *Bj2, *Bj3;
   for (j = 0; j < n - paddingN; j += NR) {
-    const float *Bj = &B(0, j);
-    const float *Bj1 = &B(0, j + 1);
-    const float *Bj2 = &B(0, j + 2);
-    const float *Bj3 = &B(0, j + 3);
+    Bj = &B(0, j);
+    Bj1 = &B(0, j + 1);
+    Bj2 = &B(0, j + 2);
+    Bj3 = &B(0, j + 3);
     for (i = 0; i < k; ++i) {
       *buffer++ = *Bj++;
       *buffer++ = *Bj1++;
@@ -64,7 +93,33 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
   if (paddingN != 0) {
     for (i = 0; i < k; ++i) {
       for (int j = n - paddingN; j < n; ++j) {
-        const float *Bij = &B(i, j);
+        *buffer++ = B(i, j);
+      }
+      for (int j = n; j < n + (NR - paddingN); ++j) {
+        *buffer++ = 0;
+      }
+    }
+  }
+}
+
+// 将B矩阵分块复制到连续内存(RowMajor)
+void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
+                  float *buffer) {
+  int i, j;
+  const float *Bij;
+  for (j = 0; j < n - paddingN; j += NR) {
+    for (i = 0; i < k; ++i) {
+      Bij = &B(i, j);
+      *buffer++ = *Bij;
+      *buffer++ = *(Bij + 1);
+      *buffer++ = *(Bij + 2);
+      *buffer++ = *(Bij + 3);
+    }
+  }
+  if (paddingN != 0) {
+    for (i = 0; i < k; ++i) {
+      Bij = &B(i, n - paddingN);
+      for (int j = n - paddingN; j < n; ++j) {
         *buffer++ = *Bij++;
       }
       for (int j = n; j < n + (NR - paddingN); ++j) {
@@ -95,9 +150,9 @@ void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B,
   static float packedB[KC * NC];
 
   if (first_time) {
-    PackMatrixB(k, n, _nc, B, ldb, packedB);
+    PackMatrixB_(k, n, _nc, B, ldb, packedB);
   }
-  PackMatrixA(m, k, _mc, A, lda, packedA);
+  PackMatrixA_(m, k, _mc, A, lda, packedA);
 
   int i, j, mc, nc;
 
diff --git a/src/operators/math/Gemm.h b/src/operators/math/gemm.h
similarity index 76%
rename from src/operators/math/Gemm.h
rename to src/operators/math/gemm.h
index 7c5f50f0de..2eea23a3b1 100644
--- a/src/operators/math/Gemm.h
+++ b/src/operators/math/gemm.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-// 矩阵取值运算宏，假设矩阵按列存储
-#define A(i, j) A[(j)*lda + (i)]
-#define B(i, j) B[(j)*ldb + (i)]
-#define C(i, j) C[(j)*ldc + (i)]
+// Matrix element access macros; they assume the matrices are stored row-major
+#define A(i, j) A[(i)*lda + (j)]
+#define B(i, j) B[(i)*ldb + (j)]
+#define C(i, j) C[(i)*ldc + (j)]
 
 // 分块计算的块大小，mc 与 kc 分别对应分块计算时的 m 与 k
 #define MC 384
@@ -32,14 +32,22 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {
 
-// 将 A 矩阵分块复制到连续内存
+// Copy matrix A block by block into contiguous memory (ColMajor)
 void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
                  float *buffer);
 
-// 将 B 矩阵分块复制到连续内存
+// Copy matrix B block by block into contiguous memory (ColMajor)
 void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
                  float *buffer);
 
+// Copy matrix A block by block into contiguous memory (RowMajor)
+void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
+                  float *buffer);
+
+// Copy matrix B block by block into contiguous memory (RowMajor)
+void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
+                  float *buffer);
+
 // 分块矩阵乘法
 void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B,
                  int ldb, float *C, int ldc, int first_time);
diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp
index b1487bb0a8..b47d408a6f 100644
--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -13,54 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "operators/math/math_function.h"
+#include "operators/math/gemm.h"
 
 namespace paddle_mobile {
 namespace operators {
 namespace math {
 
-template <>
-void gemm<float>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-                 const int M, const int N, const int K, const float alpha,
-                 const float *A, const float *B, const float beta, float *C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-
-template <>
-void gemm<double>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-                  const int M, const int N, const int K, const double alpha,
-                  const double *A, const double *B, const double beta,
-                  double *C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-
-template <>
-void gemm<float>(const bool transA, const bool transB, const int M, const int N,
-                 const int K, const float alpha, const float *A, const int lda,
-                 const float *B, const int ldb, const float beta, float *C,
-                 const int ldc) {
-  cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
-
-template <>
-void gemm<double>(const bool transA, const bool transB, const int M,
-                  const int N, const int K, const double alpha, const double *A,
-                  const int lda, const double *B, const int ldb,
-                  const double beta, double *C, const int ldc) {
-  cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
-
 template <>
 void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
                    const framework::Tensor &matrix_b, bool trans_b, float alpha,
@@ -83,11 +41,8 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
   int N = dim_out[1];
   int K = (trans_a == false) ? dim_a[1] : dim_a[0];
 
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-  gemm<float>(transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-              matrix_b.data<float>(), beta, matrix_out->data<float>());
+  sgemm(M, N, K, 1, matrix_a.data<float>(), K, matrix_b.data<float>(), N, 0,
+        matrix_out->data<float>(), N);
 }
 
 template <>
@@ -111,12 +66,6 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
   int M = dim_out[0];
   int N = dim_out[1];
   int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-  gemm<double>(transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-               matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 
 }  // namespace math
diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h
index 44158dfee2..bf81fc88a0 100644
--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include <cblas.h>
 #include <cmath>
 #include "framework/tensor.h"
 
@@ -22,16 +21,6 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {
 
-template <typename T>
-void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-          const int M, const int N, const int K, const T alpha, const T *A,
-          const T *B, const T beta, T *C);
-
-template <typename T>
-void gemm(const bool transA, const bool transB, const int M, const int N,
-          const int K, const T alpha, const T *A, const int lda, const T *B,
-          const int ldb, const T beta, T *C, const int ldc);
-
 // matrix multiply with continuous memory
 template <typename T>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
diff --git a/test/common/test_gemm.cpp.cpp b/test/common/test_gemm.cpp.cpp
index f189e53f41..0e32a87c72 100644
--- a/test/common/test_gemm.cpp.cpp
+++ b/test/common/test_gemm.cpp.cpp
@@ -14,24 +14,25 @@ limitations under the License. */
 
 #include <iostream>
 #include "common/log.h"
-#include "operators/math/Gemm.h"
+#include "operators/math/gemm.h"
 
-#define a(i, j) a[(j)*lda + (i)]
-#define b(i, j) b[(j)*ldb + (i)]
-#define c1(i, j) c1[(j)*ldc + (i)]
+#define a(i, j) a[(i)*lda + (j)]
+#define b(i, j) b[(i)*ldb + (j)]
+#define c1(i, j) c1[(i)*ldc + (j)]
+
+#define m 7
+#define n 7
+#define k 7
 
 int main() {
-  int m = 45;
-  int n = 46;
-  int k = 125;
-  int lda = m;
-  int ldb = k;
-  int ldc = m;
+  int lda = k;
+  int ldb = n;
+  int ldc = n;
 
-  float a[45 * 125];
-  float b[125 * 46];
-  float c[45 * 46] = {0};
-  float c1[45 * 46] = {0};
+  float a[7 * 7];
+  float b[7 * 7];
+  float c[7 * 7] = {0};
+  float c1[7 * 7] = {0};
   for (int i = 0; i < m * k; ++i) {
     a[i] = 2;
   }
-- 
GitLab