Commit 8f5b0311 authored by zhaojiaying01

replace openblas with gemm

Parent e639ef38
@@ -34,8 +34,6 @@ if (ANDROID)
     link_directories(third-party/protobuf/armeabi-v7a)
 else()
     # link openblas
-    include_directories(third-party/openblas/include)
-    link_directories(third-party/openblas/lib)
     link_directories(third-party/protobuf/lib)
 endif ()
@@ -47,7 +45,7 @@ if (ANDROID)
     # openblas.a need log lib
     target_link_libraries(paddle-mobile protobuf-lite)
 else()
-    target_link_libraries(paddle-mobile protobuf-lite openblas)
+    target_link_libraries(paddle-mobile protobuf-lite)
 endif ()
 #add_dependencies(paddle-mobile openblas_proj)
......
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "operators/math/Gemm.h"
-#include <iostream>
+#include "operators/math/gemm.h"
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-// Pack blocks of matrix A into contiguous memory
+// Pack blocks of matrix A into contiguous memory (ColMajor)
 void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
                  float *buffer) {
   int i, j;
@@ -45,15 +44,45 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
   }
 }
-// Pack blocks of matrix B into contiguous memory
+// Pack blocks of matrix A into contiguous memory (RowMajor)
+void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
+                  float *buffer) {
+  int i, j;
+  const float *Ai, *Ai1, *Ai2, *Ai3;
+  for (i = 0; i < m - paddingM; i += MR) {
+    Ai = &A(i, 0);
+    Ai1 = &A(i + 1, 0);
+    Ai2 = &A(i + 2, 0);
+    Ai3 = &A(i + 3, 0);
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *Ai++;
+      *buffer++ = *Ai1++;
+      *buffer++ = *Ai2++;
+      *buffer++ = *Ai3++;
+    }
+  }
+  if (paddingM != 0) {
+    for (j = 0; j < k; ++j) {
+      for (i = m - paddingM; i < m; ++i) {
+        *buffer++ = A(i, j);
+      }
+      for (i = m; i < m + (MR - paddingM); ++i) {
+        *buffer++ = 0;
+      }
+    }
+  }
+}
+// Pack blocks of matrix B into contiguous memory (ColMajor)
 void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
                  float *buffer) {
   int i, j;
+  const float *Bj, *Bj1, *Bj2, *Bj3;
   for (j = 0; j < n - paddingN; j += NR) {
-    const float *Bj = &B(0, j);
-    const float *Bj1 = &B(0, j + 1);
-    const float *Bj2 = &B(0, j + 2);
-    const float *Bj3 = &B(0, j + 3);
+    Bj = &B(0, j);
+    Bj1 = &B(0, j + 1);
+    Bj2 = &B(0, j + 2);
+    Bj3 = &B(0, j + 3);
     for (i = 0; i < k; ++i) {
       *buffer++ = *Bj++;
       *buffer++ = *Bj1++;
@@ -64,7 +93,33 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
   if (paddingN != 0) {
     for (i = 0; i < k; ++i) {
       for (int j = n - paddingN; j < n; ++j) {
-        const float *Bij = &B(i, j);
+        *buffer++ = B(i, j);
       }
       for (int j = n; j < n + (NR - paddingN); ++j) {
         *buffer++ = 0;
       }
     }
   }
 }
+// Pack blocks of matrix B into contiguous memory (RowMajor)
+void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
+                  float *buffer) {
+  int i, j;
+  const float *Bij;
+  for (j = 0; j < n - paddingN; j += NR) {
+    for (i = 0; i < k; ++i) {
+      Bij = &B(i, j);
+      *buffer++ = *Bij;
+      *buffer++ = *(Bij + 1);
+      *buffer++ = *(Bij + 2);
+      *buffer++ = *(Bij + 3);
+    }
+  }
+  if (paddingN != 0) {
+    for (i = 0; i < k; ++i) {
+      Bij = &B(i, n - paddingN);
+      for (int j = n - paddingN; j < n; ++j) {
+        *buffer++ = *Bij++;
+      }
+      for (int j = n; j < n + (NR - paddingN); ++j) {
@@ -95,9 +150,9 @@ void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B,
   static float packedB[KC * NC];
   if (first_time) {
-    PackMatrixB(k, n, _nc, B, ldb, packedB);
+    PackMatrixB_(k, n, _nc, B, ldb, packedB);
   }
-  PackMatrixA(m, k, _mc, A, lda, packedA);
+  PackMatrixA_(m, k, _mc, A, lda, packedA);
   int i, j, mc, nc;
......
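To see what the new row-major packing produces, here is a minimal standalone sketch of PackMatrixA_ (the packing logic is copied from the hunk above; MR is assumed to be 4, which the four row pointers imply). It packs a 6x3 row-major matrix, interleaving MR rows at a time and zero-filling the final partial block:

```cpp
#include <cstdio>

#define MR 4                        // assumed value; defined in gemm.h
#define A(i, j) A[(i)*lda + (j)]    // row-major access, as in gemm.h

// Copy of the row-major packing logic above, reproduced so the sketch
// compiles standalone.
void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
                  float *buffer) {
  int i, j;
  for (i = 0; i < m - paddingM; i += MR) {
    const float *Ai = &A(i, 0), *Ai1 = &A(i + 1, 0);
    const float *Ai2 = &A(i + 2, 0), *Ai3 = &A(i + 3, 0);
    for (j = 0; j < k; ++j) {       // interleave MR rows column by column
      *buffer++ = *Ai++;
      *buffer++ = *Ai1++;
      *buffer++ = *Ai2++;
      *buffer++ = *Ai3++;
    }
  }
  if (paddingM != 0) {              // last partial block, padded with zeros
    for (j = 0; j < k; ++j) {
      for (i = m - paddingM; i < m; ++i) *buffer++ = A(i, j);
      for (i = m; i < m + (MR - paddingM); ++i) *buffer++ = 0;
    }
  }
}

int main() {
  const int m = 6, k = 3, lda = k;
  float A[m * k];
  for (int i = 0; i < m * k; ++i) A[i] = static_cast<float>(i);
  float buffer[2 * MR * k];         // two MR-row blocks of k columns
  PackMatrixA_(m, k, m % MR, A, lda, buffer);
  for (int idx = 0; idx < 2 * MR * k; ++idx) printf("%.0f ", buffer[idx]);
  printf("\n");
  // Prints: 0 3 6 9 1 4 7 10 2 5 8 11  (rows 0..3 interleaved)
  //         12 15 0 0 13 16 0 0 14 17 0 0  (rows 4..5 plus zero padding)
}
```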
@@ -14,10 +14,10 @@ limitations under the License. */
 #pragma once
-// Element access macros, assuming column-major storage
-#define A(i, j) A[(j)*lda + (i)]
-#define B(i, j) B[(j)*ldb + (i)]
-#define C(i, j) C[(j)*ldc + (i)]
+// Element access macros, assuming row-major storage
+#define A(i, j) A[(i)*lda + (j)]
+#define B(i, j) B[(i)*ldb + (j)]
+#define C(i, j) C[(i)*ldc + (j)]
 // Block sizes for the tiled computation; mc and kc are the m and k of one block
 #define MC 384
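This macro swap is the heart of the port: the old macros indexed column-major storage, the new ones row-major. A short sketch shows that the same (i, j) pair picks different elements under each convention:

```cpp
#include <cstdio>

int main() {
  // Six floats viewed as a 2x3 matrix.
  float M[6] = {0, 1, 2, 3, 4, 5};
  int lda_row = 3;  // row-major leading dimension: the column count
  int lda_col = 2;  // column-major leading dimension: the row count
  // New row-major macro, A(i, j) -> A[(i)*lda + (j)]: element (0, 1)
  printf("row-major (0,1) = %g\n", M[0 * lda_row + 1]);  // prints 1
  // Old column-major macro, A(i, j) -> A[(j)*lda + (i)]: element (0, 1)
  printf("col-major (0,1) = %g\n", M[1 * lda_col + 0]);  // prints 2
  return 0;
}
```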
@@ -32,14 +32,22 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {
-// Pack blocks of matrix A into contiguous memory
+// Pack blocks of matrix A into contiguous memory (ColMajor)
 void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
                  float *buffer);
-// Pack blocks of matrix B into contiguous memory
+// Pack blocks of matrix B into contiguous memory (ColMajor)
 void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
                  float *buffer);
+// Pack blocks of matrix A into contiguous memory (RowMajor)
+void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
+                  float *buffer);
+// Pack blocks of matrix B into contiguous memory (RowMajor)
+void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
+                  float *buffer);
 // Blocked matrix multiplication
 void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B,
                  int ldb, float *C, int ldc, int first_time);
......
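InnerKernel packs one MC x KC block of A and, when first_time is set, one KC x NC panel of B into its static buffer. The outer driver that ties these declarations together is elided from this diff; the following is only a plausible sketch (the name sgemm_sketch and the loop order are assumptions, not the actual implementation) of how such a driver could tile the multiplication with the MC/KC/NC constants and the row-major macros from gemm.h:

```cpp
#include "operators/math/gemm.h"

// Hypothetical outer driver: walk C in MC x NC tiles, accumulating
// KC-deep panels. Not the real sgemm, which this diff does not show.
void sgemm_sketch(int m, int n, int k, const float *A, int lda,
                  const float *B, int ldb, float *C, int ldc) {
  for (int j = 0; j < n; j += NC) {          // NC-wide column panel of C
    int nc = (n - j < NC) ? (n - j) : NC;
    for (int p = 0; p < k; p += KC) {        // KC-deep slice of A and B
      int kc = (k - p < KC) ? (k - p) : KC;
      for (int i = 0; i < m; i += MC) {      // MC-high block of C
        int mc = (m - i < MC) ? (m - i) : MC;
        // first_time only for the first block row: the packed B panel
        // is cached in InnerKernel's static buffer and reused.
        InnerKernel(mc, nc, kc, &A(i, p), lda, &B(p, j), ldb, &C(i, j),
                    ldc, i == 0);
      }
    }
  }
}
```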
@@ -13,54 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "operators/math/math_function.h"
+#include "operators/math/gemm.h"
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-template <>
-void gemm<float>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-                 const int M, const int N, const int K, const float alpha,
-                 const float *A, const float *B, const float beta, float *C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-template <>
-void gemm<double>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-                  const int M, const int N, const int K, const double alpha,
-                  const double *A, const double *B, const double beta,
-                  double *C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-template <>
-void gemm<float>(const bool transA, const bool transB, const int M, const int N,
-                 const int K, const float alpha, const float *A, const int lda,
-                 const float *B, const int ldb, const float beta, float *C,
-                 const int ldc) {
-  cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
-template <>
-void gemm<double>(const bool transA, const bool transB, const int M,
-                  const int N, const int K, const double alpha, const double *A,
-                  const int lda, const double *B, const int ldb,
-                  const double beta, double *C, const int ldc) {
-  cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
 template <>
 void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
                    const framework::Tensor &matrix_b, bool trans_b, float alpha,
@@ -83,11 +41,8 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
   int N = dim_out[1];
   int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-  gemm<float>(transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-              matrix_b.data<float>(), beta, matrix_out->data<float>());
+  sgemm(M, N, K, 1, matrix_a.data<float>(), K, matrix_b.data<float>(), N, 0,
+        matrix_out->data<float>(), N);
 }
 template <>
@@ -111,12 +66,6 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
   int M = dim_out[0];
   int N = dim_out[1];
   int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-  gemm<double>(transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-              matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 }  // namespace math
......
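Two behavioral notes on the matmul changes above: the new sgemm call hard-codes alpha = 1 and beta = 0 and ignores trans_a/trans_b, so only the plain non-transposed product is computed, and matmul<double> is left with an empty body. A naive row-major reference multiply, sketched below under the invented name ref_gemm, is enough to cross-check the float path: compare its output element-wise against sgemm(M, N, K, 1, A, K, B, N, 0, C, N).

```cpp
#include <cstdio>

// Naive row-major reference: C = A * B with alpha = 1, beta = 0,
// matching the call shape above (lda = K, ldb = N, ldc = N).
void ref_gemm(int M, int N, int K, const float *A, const float *B, float *C) {
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) {
      float acc = 0;
      for (int p = 0; p < K; ++p) acc += A[i * K + p] * B[p * N + j];
      C[i * N + j] = acc;  // beta = 0: overwrite rather than accumulate
    }
}

int main() {
  const int M = 2, N = 2, K = 3;
  float A[M * K] = {1, 2, 3, 4, 5, 6};
  float B[K * N] = {1, 0, 0, 1, 1, 1};
  float C[M * N];
  ref_gemm(M, N, K, A, B, C);
  printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);  // prints 4 5 10 11
}
```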
@@ -14,7 +14,6 @@ limitations under the License. */
 #pragma once
-#include <cblas.h>
 #include <cmath>
 #include "framework/tensor.h"
@@ -22,16 +21,6 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {
-template <typename T>
-void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-          const int M, const int N, const int K, const T alpha, const T *A,
-          const T *B, const T beta, T *C);
-template <typename T>
-void gemm(const bool transA, const bool transB, const int M, const int N,
-          const int K, const T alpha, const T *A, const int lda, const T *B,
-          const int ldb, const T beta, T *C, const int ldc);
 // matrix multiply with continuous memory
 template <typename T>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
......
@@ -14,24 +14,25 @@ limitations under the License. */
 #include <iostream>
 #include "common/log.h"
-#include "operators/math/Gemm.h"
+#include "operators/math/gemm.h"
-#define a(i, j) a[(j)*lda + (i)]
-#define b(i, j) b[(j)*ldb + (i)]
-#define c1(i, j) c1[(j)*ldc + (i)]
+#define a(i, j) a[(i)*lda + (j)]
+#define b(i, j) b[(i)*ldb + (j)]
+#define c1(i, j) c1[(i)*ldc + (j)]
+#define m 7
+#define n 7
+#define k 7
 int main() {
-  int m = 45;
-  int n = 46;
-  int k = 125;
-  int lda = m;
-  int ldb = k;
-  int ldc = m;
+  int lda = k;
+  int ldb = n;
+  int ldc = n;
-  float a[45 * 125];
-  float b[125 * 46];
-  float c[45 * 46] = {0};
-  float c1[45 * 46] = {0};
+  float a[7 * 7];
+  float b[7 * 7];
+  float c[7 * 7] = {0};
+  float c1[7 * 7] = {0};
   for (int i = 0; i < m * k; ++i) {
     a[i] = 2;
   }
......
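With m = n = k = 7 and every element of a set to 2, the expected result is easy to verify by hand, assuming the elided lines fill b with 2 as well (that loop is cut off above): each element of c is the sum over 7 products of 2 * 2, i.e. 28. A standalone sketch of the check:

```cpp
#include <cstdio>

int main() {
  const int m = 7, n = 7, k = 7;
  float a[m * k], b[k * n], c[m * n] = {0};
  for (int i = 0; i < m * k; ++i) a[i] = 2;
  for (int i = 0; i < k * n; ++i) b[i] = 2;  // assumed fill; elided in the diff
  // Row-major triple loop, same convention as the new macros above.
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p)
        c[i * n + j] += a[i * k + p] * b[p * n + j];
  printf("c[0][0] = %g (expected 28)\n", c[0]);
}
```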