Merge pull request #1081 from xiebaiyuan/develop

trans gemm to class && add multi instance support && to unit test

Merge pull request #1081 from xiebaiyuan/develop
trans gemm to class && add multi instance support && to unit test
d51a0718 · xiebaiyuan · GitHub · f90dd802 · a058f56d · d51a0718
7 changed files
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -26,7 +26,7 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-int MC = 0;
+/*int MC = 0;
 int KC = 0;
 int NC = 0;
@@ -40,7 +40,7 @@ typedef void (*FnAddDot)(int, const float *, const float *, float *, int);
 FnPack procPackA;
 FnPack procPackB;
-FnAddDot procAddDot;
+FnAddDot procAddDot;*/
 /*
 // 将A矩阵分块复制到连续内存(ColMajor)
@@ -101,8 +101,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 */
 // 将A矩阵分块复制到连续内存(RowMajor)
-void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+void Gemm::PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
-                    float *buffer) {
+                          float *buffer) {
  const float *a0, *a1, *a2, *a3;
  for (int i = 0; i < m - m_tail; i += MR) {
    a0 = A + i * lda;
@@ -142,8 +142,8 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
  }
 }
-void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
+void Gemm::PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
-                    float *buffer) {
+                          float *buffer) {
  const int i_length = m - m_tail;
  for (int i = 0; i < i_length; i += MR) {
    const float *a0 = A + i * lda;
@@ -196,8 +196,8 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
  }
 }
-void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
+void Gemm::PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
-                        float *buffer) {
+                              float *buffer) {
  const int i_length = m - m_tail;
 #pragma omp parallel for
  for (int i = 0; i < i_length; i += MR) {
@@ -251,8 +251,8 @@ void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
  }
 }
-void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
+void Gemm::PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
-                    float *buffer) {
+                          float *buffer) {
  const int i_length = m - m_tail;
  for (int i = 0; i < i_length; i += MR) {
    const float *a0 = A + i * lda;
@@ -317,8 +317,8 @@ void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
  }
 }
-void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
+void Gemm::PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
-                        float *buffer) {
+                              float *buffer) {
  const int i_length = m - m_tail;
 #pragma omp parallel for
  for (int i = 0; i < i_length; i += MR) {
@@ -385,8 +385,8 @@ void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
 }
 // 将B矩阵分块复制到连续内存(RowMajor)
-void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
+void Gemm::PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
-                    float *buffer) {
+                          float *buffer) {
  const int j_length = n - n_tail;
  for (int j = 0; j < j_length; j += NR) {
    float *local_buffer = buffer + j * k;
@@ -436,8 +436,8 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
  }
 }
-void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
+void Gemm::PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
-                        float *buffer) {
+                              float *buffer) {
  const int j_length = n - n_tail;
 #pragma omp parallel for
  for (int j = 0; j < j_length; j += NR) {
@@ -489,8 +489,8 @@ void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
 }
 #if __aarch64__
-void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
+void Gemm::PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
-                     float *buffer) {
+                           float *buffer) {
  const int j_length = n - n_tail;
  for (int j = 0; j < j_length; j += NR) {
    float *local_buffer = buffer + j * k;
@@ -519,8 +519,8 @@ void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
  }
 }
-void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
+void Gemm::PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B,
-                         float *buffer) {
+                               int ldb, float *buffer) {
  const int j_length = n - n_tail;
 #pragma omp parallel for
  for (int j = 0; j < j_length; j += NR) {
@@ -550,8 +550,8 @@ void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
  }
 }
-void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
+void Gemm::PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
-                     float *buffer) {
+                           float *buffer) {
  const int j_length = n - n_tail;
  for (int j = 0; j < n - n_tail; j += NR) {
    float *local_buffer = buffer + j * k;
@@ -580,8 +580,8 @@ void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
  }
 }
-void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
+void Gemm::PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B,
-                         float *buffer) {
+                               int ldb, float *buffer) {
  const int j_length = n - n_tail;
 #pragma omp parallel for
  for (int j = 0; j < n - n_tail; j += NR) {
@@ -613,8 +613,9 @@ void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
 #endif  // __aarch64__
 // 分块矩阵乘法
-void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
+void Gemm::InnerKernel(int mc, int nc, float alpha, const float *a,
-                 float beta, float *c, float *C, int ldc, bool relu) {
+                       const float *b, float beta, float *c, float *C, int ldc,
+                       bool relu) {
 #pragma omp parallel for
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
@@ -648,9 +649,9 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
 }
 // 分块矩阵乘法
-void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
+void Gemm::InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
-                         const float *b, float beta, float *c, float *C,
+                               const float *b, float beta, float *c, float *C,
-                         int ldc, bool relu, float *bias) {
+                               int ldc, bool relu, float *bias) {
 #pragma omp parallel for
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
@@ -692,9 +693,10 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
 }
 // 分块矩阵乘法
-void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+void Gemm::InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
-                       const float *b, float beta, float *c, float *C, int ldc,
+                             const float *b, float beta, float *c, float *C,
-                       bool relu, float *new_scale, float *new_bias) {
+                             int ldc, bool relu, float *new_scale,
+                             float *new_bias) {
 #pragma omp parallel for
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
@@ -717,10 +719,10 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
 }
 // 分块矩阵乘法
-void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
+void Gemm::InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
-                          const float *b, float beta, float *c, float *C,
+                                const float *b, float beta, float *c, float *C,
-                          int ldc, bool relu, float *new_scale, float *new_bias,
+                                int ldc, bool relu, float *new_scale,
-                          float *bias) {
+                                float *new_bias, float *bias) {
 #pragma omp parallel for
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
@@ -737,9 +739,9 @@ void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
  WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias);
 }
-void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
+void Gemm::InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
-                          float *c, float *C, int ldc, float *p,
+                                float *c, float *C, int ldc, float *p,
-                          std::string mode, float *bias, float *bias1) {
+                                std::string mode, float *bias, float *bias1) {
 #pragma omp parallel for
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
@@ -759,7 +761,7 @@ void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
 #if __ARM_NEON
 #if __aarch64__
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
  // init C
  float32x4_t cv0 = vdupq_n_f32(0.0);
  float32x4_t cv1 = vdupq_n_f32(0.0);
@@ -794,7 +796,7 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
  //  float32x4x4_t cv = {cv0, cv1, cv2, cv3};
 }
-void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
  // init C
  float32x4_t cv0 = vdupq_n_f32(0.0);
  float32x4_t cv1 = vdupq_n_f32(0.0);
@@ -844,7 +846,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
 // 分块矩阵乘法结果回写
 // C = A * B
-void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
+void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -877,10 +879,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = alpha * A * B + beta * C
-void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
+void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
 // C = A * B + C
-void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
+void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -917,7 +919,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
  }
 }
 // C = A * B + bias
-void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {
+void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
+                          float *bias) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -955,7 +958,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {
 }
 // C = A * B + C, relu(C)
-void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
+void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -996,8 +999,8 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = A * B + bias, relu(C)
-void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
+void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
-                        float *bias) {
+                              float *bias) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -1038,8 +1041,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
 }
 // C = A * B + C,prelu(C)
-void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
+void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc,
-                       std::string mode, float *bias, float *bias1) {
+                             float *p, std::string mode, float *bias,
+                             float *bias1) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -1114,8 +1118,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
 }
 // C = A * B, batchnorm(C)
-void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
+void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
-                 float *new_bias) {
+                       float *new_scale, float *new_bias) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -1159,8 +1163,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
 }
 // C = A * B, batchnorm(C), relu(C)
-void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
-                     float *new_scale, float *new_bias) {
+                           float *new_scale, float *new_bias) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -1205,8 +1209,8 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
 }
 // C = A * B, batchnorm(C),C = C + bias; relu(C)
-void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
+void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
-                        float *new_scale, float *new_bias, float *bias) {
+                              float *new_scale, float *new_bias, float *bias) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -1259,7 +1263,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
 #else
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
  const float *a_ptr, *b_ptr;
  a_ptr = a;
  b_ptr = b;
@@ -1330,10 +1334,9 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
 }
 /*
-void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
+void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A, int
-                  const float *B, int ldb, float beta, float *C, int ldc,
+lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu) { float
-                  bool relu) {
+*bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
-  float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
  const float *a0, *b0, *b1, *b2, *b3;
  float *c0, *C0;
@@ -1552,7 +1555,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
  }
 }
-void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+void Gemm::VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
                        int lda, const float *B, int ldb, float beta, float *C,
                        int ldc, bool relu, float *new_scale, float *new_bias) {
  float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
@@ -1764,7 +1767,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
 }
 */
-void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
  const float *a_ptr, *b_ptr;
  a_ptr = a;
  b_ptr = b;
@@ -1872,7 +1875,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
 }
 // C = A * B
-void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
+void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int step = 4 * ldc;
@@ -1929,10 +1932,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = alpha * A * B + beta * C
-void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
+void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
 // C = A * B + C
-void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
+void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int step = 4 * ldc;
@@ -1996,7 +1999,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = A * B + bias
-void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {
+void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
+                          float *bias) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -2034,7 +2038,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {
 }
 // C = A * B + C, relu(C)
-void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
+void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int step = 4 * ldc;
@@ -2108,8 +2112,8 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = A * B + bias, relu(C)
-void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
+void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
-                        float *bias) {
+                              float *bias) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -2149,8 +2153,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
  }
 }
-void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
+void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc,
-                       std::string mode, float *bias, float *bias1) {
+                             float *p, std::string mode, float *bias,
+                             float *bias1) {
  if (nc < 4) {
    if (bias1 == nullptr) {
      for (int i = 0; i < mc; ++i) {
@@ -2383,8 +2388,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
 }
 // C = A * B, batchnorm(C)
-void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
+void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
-                 float *bias) {
+                       float *scale, float *bias) {
  if (nc < 4) {
    for (int i = 0; i < mc; ++i) {
      for (int j = 0; j < nc; ++j) {
@@ -2484,8 +2489,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
 }
 // C = A * B, batchnorm(C), relu(C)
-void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
+void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
-                     float *bias) {
+                           float *scale, float *bias) {
  if (nc < 4) {
    for (int i = 0; i < mc; ++i) {
      for (int j = 0; j < nc; ++j) {
@@ -2595,8 +2600,8 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
 }
 // C = A * B, batchnorm(C),C = C + bias; relu(C)
-void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
+void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
-                        float *new_scale, float *new_bias, float *bias) {
+                              float *new_scale, float *new_bias, float *bias) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;
@@ -2649,7 +2654,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
  /*
  // C = A * B
-  void VecWriteBasic(int n, float *c, float *C, int ldc) {
+  void Gemm::VecWriteBasic(int n, float *c, float *C, int ldc) {
    int nc1 = n / 16;
    int _nc1 = n % 16;
    int nc2 = _nc1 / 4;
@@ -2695,10 +2700,10 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
  }
  // C = alpha * A * B + beta * C
-  void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
+  void Gemm::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
  // C = A * B + C
-  void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
+  void Gemm::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
    int nc1 = n / 16;
    int _nc1 = n % 16;
@@ -2736,7 +2741,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
  }
  // C = A * B + C, relu(C)
-  void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
+  void Gemm::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
    int nc1 = n / 16;
    int _nc1 = n % 16;
@@ -2784,7 +2789,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
  }
  // C = A * B, batchnorm(C)
-  void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
+  void Gemm::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
                      float *bias) {
    int nc1 = n / 16;
    int _nc1 = n % 16;
@@ -2850,12 +2855,9 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
  }
  // C = A * B, batchnorm(C), relu(C)
-  void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
+  void Gemm::VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float
-                          float *bias) {
+  *scale, float *bias) { int nc1 = n / 16; int _nc1 = n % 16; int nc2 = _nc1 /
-    int nc1 = n / 16;
+  4; int nc3 = 16 - 4 * (_nc1 % 4);
-    int _nc1 = n % 16;
-    int nc2 = _nc1 / 4;
-    int nc3 = 16 - 4 * (_nc1 % 4);
    asm volatile(
        "vmov.f32   q14,      #0.0          \n\t"
@@ -2926,7 +2928,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
 #endif  // __aarch64__
 #else
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
  float *c0, *c1, *c2, *c3;
  c0 = c;
  c1 = c + ldc;
@@ -2962,38 +2964,42 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
  }
 }
-void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {}
+void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
+}
-void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {}
+void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {}
-void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
+void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
-void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {}
+void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {}
-void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {}
+void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
+                          float *bias) {}
-void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {}
+void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {}
-void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
+void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
-                        float *bias) {}
+                              float *bias) {}
-void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
+void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc,
-                       std::string mode, float *bias, float *bias1) {}
+                             float *p, std::string mode, float *bias,
+                             float *bias1) {}
-void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
+void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
-                 float *new_bias) {}
+                       float *new_scale, float *new_bias) {}
-void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
-                     float *new_scale, float *new_bias) {}
+                           float *new_scale, float *new_bias) {}
-void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
+void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
-                        float *new_scale, float *new_bias, float *bias1) {}
+                              float *new_scale, float *new_bias, float *bias1) {
+}
 #endif  // __ARM_NEON
 // 32位 float 矩阵乘法
-void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+void Gemm::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc, bool relu,
+                 const float *B, int ldb, float beta, float *C, int ldc,
-           float *bias) {
+                 bool relu, float *bias) {
  // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
  // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
  int L1 = 32 * 1024;
@@ -3063,9 +3069,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
  paddle_mobile::memory::Free(zero);
 }
-void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+void Gemm::SgemmWithBn(int m, int n, int k, float alpha, const float *A,
-                 const float *B, int ldb, float beta, float *C, int ldc,
+                       int lda, const float *B, int ldb, float beta, float *C,
-                 bool relu, float *new_scale, float *new_bias, float *bias) {
+                       int ldc, bool relu, float *new_scale, float *new_bias,
+                       float *bias) {
  // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
  // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
  int L1 = 32 * 1024;
@@ -3136,9 +3143,9 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
  paddle_mobile::memory::Free(zero);
 }
-void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
+void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
-                    const float *B, int ldb, float *C, int ldc, float *p,
+                          const float *B, int ldb, float *C, int ldc, float *p,
-                    std::string mode, float *bias, float *bias1) {
+                          std::string mode, float *bias, float *bias1) {
  // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
  // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
  int L1 = 32 * 1024;
@@ -3212,9 +3219,9 @@ void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
 }
 // 32位 float 矩阵乘法
-void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
+void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
-               const float *B, int ldb, float beta, float *C, int ldc,
+                     const float *B, int ldb, float beta, float *C, int ldc,
-               bool relu, float *bias) {
+                     bool relu, float *bias) {
 #ifdef _OPENMP
  int max_threads = omp_get_max_threads();
 #else
@@ -3237,18 +3244,18 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
    NC = (n + NR - 1) / NR * NR;
 #if __aarch64__
-    procPackA = PackMatrixA_6r;
+    procPackA = &Gemm::PackMatrixA_6r;
-    procPackB = PackMatrixB_omp_16c;
+    procPackB = &Gemm::PackMatrixB_omp_16c;
-    procAddDot = AddDot6x16;
+    procAddDot = &Gemm::AddDot6x16;
 #else
-    procPackA = PackMatrixA_6r;
+    procPackA = &Gemm::PackMatrixA_6r;
-    procPackB = PackMatrixB_omp_8c;
+    procPackB = &Gemm::PackMatrixB_omp_8c;
-    procAddDot = AddDot6x8;
+    procAddDot = &Gemm::AddDot6x8;
 #endif
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-    procPackB(KC, NC, NC % NR, B, ldb, packedB);
+    (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
  } else {
@@ -3265,18 +3272,19 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
    MC = (m + MR - 1) / MR * MR;
 #if __aarch64__
-    procPackA = PackMatrixA_omp_6r;
+    procPackA = &Gemm::PackMatrixA_omp_6r;
-    procPackB = PackMatrixB_16c;
+    procPackB = &Gemm::PackMatrixB_16c;
-    procAddDot = AddDot6x16;
+    procAddDot = &Gemm::AddDot6x16;
 #else
-    procPackA = PackMatrixA_omp_6r;
-    procPackB = PackMatrixB_8c;
+    procPackA = &Gemm::PackMatrixA_omp_6r;
-    procAddDot = AddDot6x8;
+    procPackB = &Gemm::PackMatrixB_8c;
+    procAddDot = &Gemm::AddDot6x8;
 #endif
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-    procPackA(MC, KC, MC % MR, A, lda, packedA);
+    (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
  }
@@ -3298,7 +3306,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
      mc = s_min(m - i, MC);
      float *local_A = packedA + MC * KC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
-      procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
+      (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
      InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
                          &C(i, 0), ldc, relu, bias + i);
    }
@@ -3315,7 +3323,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
      nc = s_min(n - j, NC);
      float *local_B = packedB + KC * NC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
-      procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
+      (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
      InnerKernelWithBias(m, nc, alpha, packedA, local_B, beta, local_C,
                          &C(0, j), ldc, relu, bias);
    }
@@ -3327,10 +3335,10 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
  paddle_mobile::memory::Free(zero);
 }
-void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
+void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
-                     const float *B, int ldb, float beta, float *C, int ldc,
+                           int lda, const float *B, int ldb, float beta,
-                     bool relu, float *new_scale, float *new_bias,
+                           float *C, int ldc, bool relu, float *new_scale,
-                     float *bias) {
+                           float *new_bias, float *bias) {
 #ifdef _OPENMP
  int max_threads = omp_get_max_threads();
 #else
@@ -3353,18 +3361,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
    NC = (n + NR - 1) / NR * NR;
 #if __aarch64__
-    procPackA = PackMatrixA_6r;
+    procPackA = &Gemm::PackMatrixA_6r;
-    procPackB = PackMatrixB_omp_16c;
+    procPackB = &Gemm::PackMatrixB_omp_16c;
-    procAddDot = AddDot6x16;
+    procAddDot = &Gemm::AddDot6x16;
 #else
-    procPackA = PackMatrixA_6r;
+    procPackA = &Gemm::PackMatrixA_6r;
-    procPackB = PackMatrixB_omp_8c;
+    procPackB = &Gemm::PackMatrixB_omp_8c;
-    procAddDot = AddDot6x8;
+    procAddDot = &Gemm::AddDot6x8;
 #endif
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-    procPackB(KC, NC, NC % NR, B, ldb, packedB);
+    (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
  } else {
@@ -3381,18 +3389,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
    MC = (m + MR - 1) / MR * MR;
 #if __aarch64__
-    procPackA = PackMatrixA_omp_6r;
+    procPackA = &Gemm::PackMatrixA_omp_6r;
-    procPackB = PackMatrixB_16c;
+    procPackB = &Gemm::PackMatrixB_16c;
-    procAddDot = AddDot6x16;
+    procAddDot = &Gemm::AddDot6x16;
 #else
-    procPackA = PackMatrixA_omp_6r;
+    procPackA = &Gemm::PackMatrixA_omp_6r;
-    procPackB = PackMatrixB_8c;
+    procPackB = &Gemm::PackMatrixB_8c;
-    procAddDot = AddDot6x8;
+    procAddDot = &Gemm::AddDot6x8;
 #endif
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-    procPackA(MC, KC, MC % MR, A, lda, packedA);
+    (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
  }
@@ -3414,7 +3422,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
      mc = s_min(m - i, MC);
      float *local_A = packedA + MC * KC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
-      procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
+      (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
      if (bias == nullptr) {
        InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C,
                          &C(i, 0), ldc, relu, new_scale + i, new_bias + i);
@@ -3437,7 +3445,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
      nc = s_min(n - j, NC);
      float *local_B = packedB + KC * NC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
-      procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
+      (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
      if (bias == nullptr) {
        InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C,
                          &C(0, j), ldc, relu, new_scale, new_bias);
@@ -3455,9 +3463,10 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
  paddle_mobile::memory::Free(zero);
 }
-void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
+void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
-                        const float *B, int ldb, float *C, int ldc, float *p,
+                              const float *B, int ldb, float *C, int ldc,
-                        std::string mode, float *bias, float *bias1) {
+                              float *p, std::string mode, float *bias,
+                              float *bias1) {
 #ifdef _OPENMP
  int max_threads = omp_get_max_threads();
 #else
@@ -3480,18 +3489,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
    NC = (n + NR - 1) / NR * NR;
 #if __aarch64__
-    procPackA = PackMatrixA_6r;
+    procPackA = &Gemm::PackMatrixA_6r;
-    procPackB = PackMatrixB_omp_16c;
+    procPackB = &Gemm::PackMatrixB_omp_16c;
-    procAddDot = AddDot6x16;
+    procAddDot = &Gemm::AddDot6x16;
 #else
-    procPackA = PackMatrixA_6r;
+    procPackA = &Gemm::PackMatrixA_6r;
-    procPackB = PackMatrixB_omp_8c;
+    procPackB = &Gemm::PackMatrixB_omp_8c;
-    procAddDot = AddDot6x8;
+    procAddDot = &Gemm::AddDot6x8;
 #endif
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-    procPackB(KC, NC, NC % NR, B, ldb, packedB);
+    (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
  } else {
@@ -3508,18 +3517,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
    MC = (m + MR - 1) / MR * MR;
 #if __aarch64__
-    procPackA = PackMatrixA_omp_6r;
+    procPackA = &Gemm::PackMatrixA_omp_6r;
-    procPackB = PackMatrixB_16c;
+    procPackB = &Gemm::PackMatrixB_16c;
-    procAddDot = AddDot6x16;
+    procAddDot = &Gemm::AddDot6x16;
 #else
-    procPackA = PackMatrixA_omp_6r;
+    procPackA = &Gemm::PackMatrixA_omp_6r;
-    procPackB = PackMatrixB_8c;
+    procPackB = &Gemm::PackMatrixB_8c;
-    procAddDot = AddDot6x8;
+    procAddDot = &Gemm::AddDot6x8;
 #endif
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-    procPackA(MC, KC, MC % MR, A, lda, packedA);
+    (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
  }
@@ -3541,7 +3550,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
      mc = s_min(m - i, MC);
      float *local_A = packedA + MC * KC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
-      procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
+      (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
      if (bias1 == nullptr) {
        InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
                             p + i, mode, bias + i, nullptr);
@@ -3563,7 +3572,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
      nc = s_min(n - j, NC);
      float *local_B = packedB + KC * NC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
-      procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
+      (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
      if (bias1 == nullptr) {
        InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p,
                             mode, bias, nullptr);
@@ -3580,7 +3589,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
  paddle_mobile::memory::Free(zero);
 }
-void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
 #if __ARM_NEON
 #if __aarch64__
@@ -3867,7 +3876,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
 }
 #if __aarch64__
-void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemm::AddDot8x12(int k, const float *a, const float *b, float *c,
+                      int ldc) {
  const float *a_ptr, *b_ptr;
  a_ptr = a;
  b_ptr = b;
@@ -3956,7 +3966,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
        "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
 }
-void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemm::AddDot6x16(int k, const float *a, const float *b, float *c,
+                      int ldc) {
  const float *a_ptr, *b_ptr;
  a_ptr = a;
  b_ptr = b;

--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -35,146 +35,166 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {
-/*
+class Gemm {
+ public:
+  /*
 // 将 A 矩阵分块复制到连续内存(ColMajor)
 void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
-                 float *buffer);
+           float *buffer);
 // 将 B 矩阵分块复制到连续内存(ColMajor)
 void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
-                 float *buffer);
+           float *buffer);
 */
+  typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *);
-// 将 A 矩阵分块复制到连续内存(RowMajor)
+  typedef void (Gemm::*FnAddDot)(int, const float *, const float *, float *,
-void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+                                 int);
-                    float *buffer);
+  FnPack procPackA;
-void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
+  FnPack procPackB;
-                    float *buffer);
+  FnAddDot procAddDot;
-void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
-                    float *buffer);
+  // 将 A 矩阵分块复制到连续内存(RowMajor)
-void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
+  void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
-                        float *buffer);
+                      float *buffer);
-void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
+  void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
-                        float *buffer);
+                      float *buffer);
+  void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
-// 将 B 矩阵分块复制到连续内存(RowMajor)
+                      float *buffer);
-void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
+  void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
-                    float *buffer);
+                          float *buffer);
-void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
+  void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
-                     float *buffer);
+                          float *buffer);
-void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
-                     float *buffer);
+  // 将 B 矩阵分块复制到连续内存(RowMajor)
-void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
+  void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
-                        float *buffer);
+                      float *buffer);
-void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
+  void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
-                         float *buffer);
+                       float *buffer);
-void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
+  void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
-                         float *buffer);
+                       float *buffer);
+  void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
-// 分块矩阵乘法
+                          float *buffer);
-void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
+  void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
-                 float beta, float *c, float *C, int ldc, bool relu);
+                           float *buffer);
-void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
+  void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
+                           float *buffer);
+  // 分块矩阵乘法
+  void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
+                   float beta, float *c, float *C, int ldc, bool relu);
+  void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
+                           const float *b, float beta, float *c, float *C,
+                           int ldc, bool relu, float *bias);
+  void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
                         const float *b, float beta, float *c, float *C,
-                         int ldc, bool relu, float *bias);
+                         int ldc, bool relu, float *new_scale, float *new_bias);
+  void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
-void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+                            const float *b, float beta, float *c, float *C,
-                       const float *b, float beta, float *c, float *C, int ldc,
+                            int ldc, bool relu, float *new_scale,
-                       bool relu, float *new_scale, float *new_bias);
+                            float *new_bias, float *bias);
-void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
+  void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
-                          const float *b, float beta, float *c, float *C,
+                            float *c, float *C, int ldc, float *p,
-                          int ldc, bool relu, float *new_scale, float *new_bias,
+                            std::string mode, float *bias, float *bias1);
+  /*
+  // 向量矩阵乘法 (M = 1)
+  void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
+                    const float *B, int ldb, float beta, float *C, int ldc,
+                    bool relu);
+  void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+                          int lda, const float *B, int ldb, float beta, float
+  *C, int ldc, bool relu, float *new_scale, float *new_bias);
+  */
+  // 计算一个更小的 C 矩阵分块
+  void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
+  void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
+  void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
+  void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
+  void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
+  // 分块矩阵乘法结果回写
+  // C = A * B
+  void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
+  // C = alpha * A * B + beta * C
+  void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
+  // C = A * B + C
+  void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
+  // C = A * B + bias
+  void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias);
+  // C = A * B + C, relu(C)
+  void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
+  // C = A * B + C,prelu(C)
+  void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
+                         std::string mode, float *bias, float *bias1);
+  // C = A * B + bias ,relu(C)
+  void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
                          float *bias);
-void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
+  // C = A * B, batchnorm(C)
-                          float *c, float *C, int ldc, float *p,
+  void WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
-                          std::string mode, float *bias, float *bias1);
+                   float *new_scale, float *new_bias);
-/*
+  // C = A * B, batchnorm(C), relu(C)
-// 向量矩阵乘法 (M = 1)
+  void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
-void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
+                       float *new_scale, float *new_bias);
-                  const float *B, int ldb, float beta, float *C, int ldc,
+  void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
-                  bool relu);
+                          float *new_scale, float *new_bias, float *bias1);
+  /*
-void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+  // 向量矩阵乘法结果回写
-                        int lda, const float *B, int ldb, float beta, float *C,
+  // C = A * B
-                        int ldc, bool relu, float *new_scale, float *new_bias);
+  void VecWriteBasic(int n, float *c, float *C, int ldc);
-*/
+  // C = alpha * A * B + beta * C
+  void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
+  // C = A * B + C
+  void VecWriteWithAdd(int n, float *c, float *C, int ldc);
+  // C = A * B + C, relu(C)
+  void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
+  // C = A * B, batchnorm(C)
+  void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
+                      float *new_bias);
+  // C = A * B, batchnorm(C), relu(C)
+  void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
+                          float *new_bias);
+  */
+  // 32位 float 矩阵乘法
+  void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+             const float *B, int ldb, float beta, float *C, int ldc, bool relu,
+             float *bias);
+  // 32位 float 矩阵乘法, 并对结果进行 batchnorm
+  void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+                   const float *B, int ldb, float beta, float *C, int ldc,
+                   bool relu, float *new_scale, float *new_bias, float *bias);
+  void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
+                      const float *B, int ldb, float *C, int ldc, float *p,
+                      std::string mode, float *bias, float *bias1);
+  // 32位 float 矩阵乘法（openmp 多线程版本）
+  void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
+                 bool relu, float *bias);
-// 计算一个更小的 C 矩阵分块
+  // 32位 float 矩阵乘法, 并对结果进行 batchnorm（openmp 多线程版本）
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
+  void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
-void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
+                       int lda, const float *B, int ldb, float beta, float *C,
-void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
+                       int ldc, bool relu, float *new_scale, float *new_bias,
-void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
+                       float *bias);
-void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
+  void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
-// 分块矩阵乘法结果回写
+                          const float *B, int ldb, float *C, int ldc, float *p,
-// C = A * B
+                          std::string mode, float *bias, float *bias1);
-void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
-// C = alpha * A * B + beta * C
-void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
-// C = A * B + C
-void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
-// C = A * B + bias
-void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias);
-// C = A * B + C, relu(C)
-void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
-// C = A * B + C,prelu(C)
-void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
-                       std::string mode, float *bias, float *bias1);
-// C = A * B + bias ,relu(C)
-void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
-                        float *bias);
-// C = A * B, batchnorm(C)
-void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
-                 float *new_bias);
-// C = A * B, batchnorm(C), relu(C)
-void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
-                     float *new_scale, float *new_bias);
-void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
-                        float *new_scale, float *new_bias, float *bias1);
-/*
-// 向量矩阵乘法结果回写
-// C = A * B
-void VecWriteBasic(int n, float *c, float *C, int ldc);
-// C = alpha * A * B + beta * C
-void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
-// C = A * B + C
-void VecWriteWithAdd(int n, float *c, float *C, int ldc);
-// C = A * B + C, relu(C)
-void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
-// C = A * B, batchnorm(C)
-void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
-                    float *new_bias);
-// C = A * B, batchnorm(C), relu(C)
-void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
-                        float *new_bias);
-*/
-// 32位 float 矩阵乘法
+ private:
-void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+  int MC = 0;
-           const float *B, int ldb, float beta, float *C, int ldc, bool relu,
+  int KC = 0;
-           float *bias);
+  int NC = 0;
-// 32位 float 矩阵乘法, 并对结果进行 batchnrom
+  float *packedA;
-void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+  float *packedB;
-                 const float *B, int ldb, float beta, float *C, int ldc,
+  float *packedC;
-                 bool relu, float *new_scale, float *new_bias, float *bias);
+  float *zero;
-void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
+};
-                    const float *B, int ldb, float *C, int ldc, float *p,
-                    std::string mode, float *bias, float *bias1);
-// 32位 float 矩阵乘法（openmp 多线程版本）
-void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
-               const float *B, int ldb, float beta, float *C, int ldc,
-               bool relu, float *bias);
-// 32位 float 矩阵乘法, 并对结果进行 batchnrom（openmp 多线程版本）
-void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
-                     const float *B, int ldb, float beta, float *C, int ldc,
-                     bool relu, float *new_scale, float *new_bias, float *bias);
-void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
-                        const float *B, int ldb, float *C, int ldc, float *p,
-                        std::string mode, float *bias, float *bias1);
 }  // namespace math
 }  // namespace operators

--- a/src/operators/math/gru_compute.cpp
+++ b/src/operators/math/gru_compute.cpp
@@ -28,19 +28,22 @@ struct GRUUnitFunctor<CPU, T> {
  static void compute(GRUMetaValue<T> value, int frame_size, int batch_size,
                      const ActivationType active_node,
                      const ActivationType active_gate) {
+    Gemm gemm;
    if (value.prev_out_value) {
-      Sgemm(batch_size, frame_size * 2, frame_size, 1, value.prev_out_value,
+      gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1,
-            frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value,
+                 value.prev_out_value, frame_size, value.gate_weight,
-            frame_size * 3, false, nullptr);
+                 frame_size * 2, 1, value.gate_value, frame_size * 3, false,
+                 nullptr);
    }
    forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size,
                         batch_size, active_gate);
    if (value.prev_out_value) {
-      Sgemm(batch_size, frame_size, frame_size, 1, value.reset_output_value,
+      gemm.Sgemm(batch_size, frame_size, frame_size, 1,
-            frame_size, value.state_weight, frame_size, 1,
+                 value.reset_output_value, frame_size, value.state_weight,
-            value.gate_value + frame_size * 2, frame_size * 3, false, nullptr);
+                 frame_size, 1, value.gate_value + frame_size * 2,
+                 frame_size * 3, false, nullptr);
    }
    forward_final_output(forward::gru_finalOutput<T>(), value, frame_size,

--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -36,6 +36,7 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
  int M = dim_out[0];
  int N = dim_out[1];
  int K = (!trans_a) ? dim_a[1] : dim_a[0];
+  Gemm gemm;
  if (trans_a) {
    int numel = matrix_a.numel();
@@ -50,20 +51,24 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
        a[index++] = tmp[i * n + j];
      }
    }
 #ifdef _OPENMP
-    Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
-              matrix_out->data<float>(), N, relu, bias);
+    gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
+                   matrix_out->data<float>(), N, relu, bias);
 #else
-    Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
+    gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
-          matrix_out->data<float>(), N, relu, bias);
+               matrix_out->data<float>(), N, relu, bias);
 #endif
  } else {
 #ifdef _OPENMP
-    Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
+    gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K,
-              N, beta, matrix_out->data<float>(), N, relu, bias);
+                   matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
+                   N, relu, bias);
 #else
-    Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
+    gemm.Sgemm(M, N, K, alpha, matrix_a.data<float>(), K,
-          beta, matrix_out->data<float>(), N, relu, bias);
+               matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
+               relu, bias);
 #endif
  }
 }
@@ -74,6 +79,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
                         float alpha, framework::Tensor *matrix_out, float beta,
                         bool relu, framework::Tensor *new_scale,
                         framework::Tensor *new_bias, int group, float *bias) {
+  Gemm gemm;
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -86,21 +92,22 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
  int K = (!trans_a) ? dim_a[1] : dim_a[0];
 #ifdef _OPENMP
-  SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K,
+  gemm.SgemmWithBn_omp(
-                  matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
+      M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-                  relu, new_scale->data<float>() + group,
+      beta, matrix_out->data<float>(), N, relu,
-                  new_bias->data<float>() + group, bias);
+      new_scale->data<float>() + group, new_bias->data<float>() + group, bias);
 #else
-  SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
+  gemm.SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K,
-              N, beta, matrix_out->data<float>(), N, relu,
+                   matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
-              new_scale->data<float>() + group, new_bias->data<float>() + group,
+                   N, relu, new_scale->data<float>() + group,
-              bias);
+                   new_bias->data<float>() + group, bias);
 #endif
 }
 void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
                     const framework::Tensor &matrix_b, bool trans_b,
                     framework::Tensor *matrix_out, float *p, std::string mode,
                     float *bias, float *bias1) {
+  Gemm gemm;
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -113,11 +120,13 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
  int K = (!trans_a) ? dim_a[1] : dim_a[0];
 #ifdef _OPENMP
-  SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(),
+  gemm.SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K,
-                     N, matrix_out->data<float>(), N, p, mode, bias, bias1);
+                          matrix_b.data<float>(), N, matrix_out->data<float>(),
+                          N, p, mode, bias, bias1);
 #else
-  SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
+  gemm.SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K,
-                 matrix_out->data<float>(), N, p, mode, bias, bias1);
+                      matrix_b.data<float>(), N, matrix_out->data<float>(), N,
+                      p, mode, bias, bias1);
 #endif
 }

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -35,8 +35,8 @@ if (CON GREATER -1)
    ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-yolo paddle-mobile)
    # gen test
-    ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
+    ADD_EXECUTABLE(test-yolo-combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test_yolo_combined paddle-mobile)
+    target_link_libraries(test-yolo-combined paddle-mobile)
    set(FOUND_MATCH ON)
 endif ()
@@ -323,5 +323,10 @@ if (NOT FOUND_MATCH)
    target_link_libraries(test-fssd paddle-mobile)
+    # gen test
+    ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
+    target_link_libraries(test-multi-process paddle-mobile)
    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
 endif ()
--- a/test/common/test_gemm_accuracy.cpp
+++ b/test/common/test_gemm_accuracy.cpp
@@ -83,8 +83,9 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
    }
  }
-  paddle_mobile::operators::math::SgemmWithBn(
+  paddle_mobile::operators::math::Gemm gemm;
-      m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr);
+  gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
+                   nullptr);
  int eq = 0;
  int neq = 0;
  for (int i = 0; i < m * n; ++i) {

--- a/test/net/test_multi_inference_predict.cpp
+++ b/test/net/test_multi_inference_predict.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <iostream>
+#include <thread>  // NOLINT
+#include "../test_helper.h"
+#include "../test_include.h"
+void fun_yolo();
+int fun_mobilenet();
+int main() {
+  // Instance constructed on the main thread and kept alive for the whole run;
+  // nothing below uses it directly.
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile2;
+  // Drive both models at the same time, one thread per model. (For a
+  // sequential run, call fun_yolo() and fun_mobilenet() directly instead.)
+  std::thread yolo_worker(fun_yolo);
+  std::thread mobilenet_worker(fun_mobilenet);
+  // Wait for both workers before exiting.
+  yolo_worker.join();
+  mobilenet_worker.join();
+  return 0;
+}
+// Loads the g_yolo model into its own PaddleMobile instance, runs ten
+// predictions on a 1x3x227x227 input built by SetupTensor, and prints the
+// load time and the mean time per prediction to stdout. If the model fails
+// to load, nothing is predicted or printed.
+void fun_yolo() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+  auto time1 = time();
+  if (paddle_mobile.Load(g_yolo, true)) {
+    auto time2 = time();
+    // Load cost is the span time1..time2; passing time1 twice would always
+    // report zero.
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+
+    vector<int64_t> dims{1, 3, 227, 227};
+    Tensor input_tensor;
+    SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
+                       static_cast<float>(1));
+
+    // Copy the tensor contents into a flat vector for Predict().
+    vector<float> input(input_tensor.data<float>(),
+                        input_tensor.data<float>() + input_tensor.numel());
+
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      paddle_mobile.Predict(input, dims);
+    }
+    auto time4 = time();
+    // Mean over the ten timed predictions.
+    std::cout << "thread 1:   predict cost :" << time_diff(time3, time4) / 10
+              << "ms" << std::endl;
+  }
+}
+// Loads the g_mobilenet model into its own PaddleMobile instance, predicts
+// once on the input read by GetInput from g_test_image_1x3x224x224_banana and
+// prints the largest output value with its index, then runs ten warm-up
+// predictions followed by ten timed ones and prints the mean time per
+// prediction. The closing hint about the test image is printed whether or
+// not the model loaded. Always returns 0.
+int fun_mobilenet() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+  auto time1 = time();
+  auto isok = paddle_mobile.Load(g_mobilenet, true);
+  if (isok) {
+    auto time2 = time();
+    // Load cost is the span time1..time2; passing time1 twice would always
+    // report zero.
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+
+    vector<float> input;
+    vector<int64_t> dims{1, 3, 224, 224};
+    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
+
+    auto vec_result = paddle_mobile.Predict(input, dims);
+    auto biggest = max_element(begin(vec_result), end(vec_result));
+    std::cout << " Max element is " << *biggest << " at position "
+              << distance(begin(vec_result), biggest) << std::endl;
+
+    // Warm up with ten untimed predictions.
+    for (int i = 0; i < 10; ++i) {
+      paddle_mobile.Predict(input, dims);
+    }
+
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      paddle_mobile.Predict(input, dims);
+    }
+    // Stop the clock before logging so the log call is not counted as
+    // prediction time.
+    auto time4 = time();
+    DLOG << vec_result;
+    // Mean over the ten timed predictions.
+    std::cout << "thread 2:  predict cost :" << time_diff(time3, time4) / 10
+              << "ms" << std::endl;
+  }
+  std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
+               "是否存在?"
+            << std::endl;
+  return 0;
+}