Optimize GEMM data packing; this brings a 22% speedup for the OCR detection model

ce0f2bfd · hjchen2 · c070770c · ce0f2bfd · ce0f2bfd
展开全部 · 隐藏空白更改
内联 · 并排

Showing 2 changed files with 448 additions and 449 deletions

src/operators/math/gemm.cpp src/operators/math/gemm.cpp +441 -430

src/operators/math/gemm.h src/operators/math/gemm.h +7 -19

未找到文件。
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -46,37 +46,25 @@ namespace math {
 class Gemm {
 public:
-  typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *);
+  typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *,
+                               const bool);
  typedef void (Gemm::*FnAddDot)(int, const float *, const float *, float *,
                                 int);
  FnPack procPackA;
  FnPack procPackB;
  FnAddDot procAddDot;
-  // 将 A\B 矩阵分块复制到连续内存(RowMajor)
-  void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
-                      float *buffer);
  void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
-                      float *buffer);
+                      float *buffer, const bool parallel);
-  void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
-                          float *buffer);
  void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
-                      float *buffer);
+                      float *buffer, const bool parallel);
-  void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
-                          float *buffer);
  void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
-                      float *buffer);
+                      float *buffer, const bool parallel);
-  void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
-                          float *buffer);
 #if __aarch64__
  void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
-                       float *buffer);
+                       float *buffer, const bool parallel);
-  void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
-                           float *buffer);
  void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
-                       float *buffer);
+                       float *buffer, const bool parallel);
-  void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
-                           float *buffer);
 #endif
  // 分块矩阵乘法