Optimize GEMM data packing; this brings a 22% speedup for the OCR detection model

e4615bde · hjchen2 · 7a507e95 · e4615bde · e4615bde
展开全部隐藏空白更改
内联并排

Showing 2 changed files with 448 additions and 449 deletions

src/operators/math/gemm.cpp src/operators/math/gemm.cpp +441 -430

src/operators/math/gemm.h src/operators/math/gemm.h +7 -19

未找到文件。
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -46,37 +46,25 @@ namespace math {

 class Gemm {
 public:
-  typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *);
+  typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *,
+                               const bool);
  typedef void (Gemm::*FnAddDot)(int, const float *, const float *, float *,
                                 int);
  FnPack procPackA;
  FnPack procPackB;
  FnAddDot procAddDot;

-  // 将 A\B 矩阵分块复制到连续内存(RowMajor)
-  void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
-                      float *buffer);
  void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
-                      float *buffer);
-  void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
-                          float *buffer);
+                      float *buffer, const bool parallel);
  void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
-                      float *buffer);
-  void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
-                          float *buffer);
+                      float *buffer, const bool parallel);
  void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
-                      float *buffer);
-  void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
-                          float *buffer);
+                      float *buffer, const bool parallel);
 #if __aarch64__
  void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
-                       float *buffer);
-  void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
-                           float *buffer);
+                       float *buffer, const bool parallel);
  void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
-                       float *buffer);
-  void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
-                           float *buffer);
+                       float *buffer, const bool parallel);
 #endif

  // 分块矩阵乘法