Merge branch 'develop' into develop

fdbeb280 · eclipsycn · GitHub · 41cf54a2 · 09e82b6d · fdbeb280
8 changed files
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -409,9 +409,6 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
 template <typename Dtype, Precision P>
 void Executor<Dtype, P>::SetThreadNum(int num) {
-  for (int k = 0; k < std::max(num, 3); ++k) {
-    operators::math::Gemmer::gemmers.push_back(new operators::math::Gemmer());
-  }
 #ifdef _OPENMP
  //  omp_set_dynamic(0);
  omp_set_num_threads(num);

--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -14,14 +14,10 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_OP
 #pragma once
-#if _OPENMP
-#include <omp.h>
-#endif
 #include <vector>
 #include "operators/math/conv_func.h"
 #include "operators/math/depthwise_conv_3x3.h"
-#include "operators/math/gemm.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
@@ -110,33 +106,9 @@ void ConvAddBasic(const FusionConvAddParam &param) {
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+      math::matmul<float>(filter_slice, false, col_matrix, false,
-      auto dim_a = filter_slice.dims();
+                          static_cast<float>(1), &out_slice,
-      auto dim_b = col_matrix.dims();
+                          static_cast<float>(1));
-      auto dim_out = out_slice.dims();
-      int m = dim_out[0];
-      int n = dim_out[1];
-      int k = dim_a[1];
-      float *output_data = out_slice.data<float>();
-      int thread_num = 4;
-      int m1 = m / thread_num;
-      int m2 = m % thread_num;
-#pragma omp parallel for
-      for (int j = 0; j < thread_num; ++j) {
-        int row_count = m1;
-        if (j == thread_num - 1) {
-          row_count = m1 + m2;
-        }
-        math::Gemmer::gemmers[j]->Sgemm(
-            row_count, n, k, 1, filter_slice.data<float>() + j * m1 * k, k,
-            col_matrix.data<float>(), n, 1, output_data + j * m1 * n, n, false);
-      }
-      //        math::matmul<float>(filter_slice, false, col_matrix, false,
-      //                            static_cast<float>(1), &out_slice,
-      //                            static_cast<float>(1));
    }
  }
 }

--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -22,11 +22,17 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {
+int MC = 0;
-std::vector<Gemmer *> Gemmer::gemmers;
+int KC = 0;
+int NC = 0;
+float *packedA;
+float *packedB;
+float *packedC;
+float *zero;
 // 将A矩阵分块复制到连续内存(ColMajor)
-void Gemmer::PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
+void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
-                         float *buffer) {
+                 float *buffer) {
  int i, j;
  const float *Aij;
  for (i = 0; i < m - m_tail; i += MR) {
@@ -52,8 +58,8 @@ void Gemmer::PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
 }
 // 将A矩阵分块复制到连续内存(RowMajor)
-void Gemmer::PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
+void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-                          float *buffer) {
+                  float *buffer) {
  const float *a0, *a1, *a2, *a3;
  for (int i = 0; i < m - m_tail; i += MR) {
    a0 = A + i * lda;
@@ -92,8 +98,8 @@ void Gemmer::PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
 }
 // 将B矩阵分块复制到连续内存(ColMajor)
-void Gemmer::PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
+void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
-                         float *buffer) {
+                 float *buffer) {
  int i, j;
  const float *Bj, *Bj1, *Bj2, *Bj3;
  for (j = 0; j < n - n_tail; j += NR) {
@@ -121,8 +127,8 @@ void Gemmer::PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 }
 // 将B矩阵分块复制到连续内存(RowMajor)
-void Gemmer::PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
+void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
-                          float *buffer) {
+                  float *buffer) {
  const float *b0;
  for (int j = 0; j < n - n_tail; j += NR) {
    for (int i = 0; i < k; ++i) {
@@ -150,9 +156,8 @@ void Gemmer::PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
 }
 // 分块矩阵乘法
-void Gemmer::InnerKernel(int mc, int nc, float alpha, const float *a,
+void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
-                         const float *b, float beta, float *c, float *C,
+                 float beta, float *c, float *C, int ldc, bool relu) {
-                         int ldc, bool relu) {
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
      // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
@@ -179,10 +184,9 @@ void Gemmer::InnerKernel(int mc, int nc, float alpha, const float *a,
 }
 // 分块矩阵乘法
-void Gemmer::InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
-                               const float *b, float beta, float *c, float *C,
+                       const float *b, float beta, float *c, float *C, int ldc,
-                               int ldc, bool relu, float *new_scale,
+                       bool relu, float *new_scale, float *new_bias) {
-                               float *new_bias) {
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
      // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
@@ -198,8 +202,7 @@ void Gemmer::InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
 }
 #if defined(IOS)
-void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *C,
+void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
-                       int ldc) {
  // init C
  float32x4_t cv0 = vdupq_n_f32(0.0);
  float32x4_t cv1 = vdupq_n_f32(0.0);
@@ -250,8 +253,7 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *C,
 }  // namespace math
 #elif defined(ARMV7)
-void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
-                       int ldc) {
  const float *a_ptr, *b_ptr;
  a_ptr = a;
  b_ptr = b;
@@ -322,8 +324,7 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
 }
 #else
-void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
-                       int ldc) {
  float *c0, *c1, *c2, *c3;
  c0 = c;
  c1 = c + ldc;
@@ -362,9 +363,8 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
 #endif
 // 32位 float 矩阵乘法
-void Gemmer::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-                   const float *B, int ldb, float beta, float *C, int ldc,
+           const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
-                   bool relu) {
  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
  int L1 = 30 * 1024;
@@ -415,10 +415,9 @@ void Gemmer::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
  paddle_mobile::memory::Free(zero);
 }
-void Gemmer::SgemmWithBn(int m, int n, int k, float alpha, const float *A,
+void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
-                         int lda, const float *B, int ldb, float beta, float *C,
+                 const float *B, int ldb, float beta, float *C, int ldc,
-                         int ldc, bool relu, float *new_scale,
+                 bool relu, float *new_scale, float *new_bias) {
-                         float *new_bias) {
  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
  int L1 = 30 * 1024;
@@ -469,9 +468,9 @@ void Gemmer::SgemmWithBn(int m, int n, int k, float alpha, const float *A,
  paddle_mobile::memory::Free(zero);
 }
-void Gemmer::VectorKernel(int m, int n, int k, float alpha, const float *A,
+void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                          int lda, const float *B, int ldb, float beta,
+                  const float *B, int ldb, float beta, float *C, int ldc,
-                          float *C, int ldc, bool relu) {
+                  bool relu) {
  float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
  const float *a0, *b0, *b1, *b2, *b3;
@@ -691,10 +690,9 @@ void Gemmer::VectorKernel(int m, int n, int k, float alpha, const float *A,
  }
 }
-void Gemmer::VectorKernelWithBn(int m, int n, int k, float alpha,
+void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
-                                const float *A, int lda, const float *B,
+                        int lda, const float *B, int ldb, float beta, float *C,
-                                int ldb, float beta, float *C, int ldc,
+                        int ldc, bool relu, float *new_scale, float *new_bias) {
-                                bool relu, float *new_scale, float *new_bias) {
  float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
  const float *a0, *b0, *b1, *b2, *b3;
@@ -903,8 +901,7 @@ void Gemmer::VectorKernelWithBn(int m, int n, int k, float alpha,
  }
 }
-void Gemmer::AddDot4x8(int k, const float *a, const float *b, float *c,
+void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
-                       int ldc) {
  const float *a_ptr, *b_ptr;
  a_ptr = a;
  b_ptr = b;
@@ -1012,7 +1009,7 @@ void Gemmer::AddDot4x8(int k, const float *a, const float *b, float *c,
 }
 // C = A * B
-void Gemmer::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
+void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int step = 4 * ldc;
@@ -1069,10 +1066,10 @@ void Gemmer::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = alpha * A * B + beta * C
-void Gemmer::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
+void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
 // C = A * B + C
-void Gemmer::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
+void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int step = 4 * ldc;
@@ -1136,7 +1133,7 @@ void Gemmer::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = A * B + C, relu(C)
-void Gemmer::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
+void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int step = 4 * ldc;
@@ -1210,14 +1207,14 @@ void Gemmer::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = A * B, batchnorm(C)
-void Gemmer::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
+void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
-                         float *scale, float *bias) {
+                 float *bias) {
-  int nc1 = nc / 16;
+  int volatile nc1 = nc / 16;
  int _nc1 = nc % 16;
-  int nc2 = _nc1 / 4;
+  int volatile nc2 = _nc1 / 4;
-  int nc3 = 16 - 4 * (_nc1 % 4);
+  int volatile nc3 = 16 - 4 * (_nc1 % 4);
-  int step = 4 * (ldc - nc);
+  int volatile step = 4 * (ldc - nc);
-  int step1 = 4 * (NC - nc);
+  int volatile step1 = 4 * (NC - nc);
  asm volatile(
      "subs       %[mc], %[mc], #1        \n\t"
@@ -1296,8 +1293,8 @@ void Gemmer::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
 }
 // C = A * B, batchnorm(C), relu(C)
-void Gemmer::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
-                             float *scale, float *bias) {
+                     float *bias) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int nc2 = _nc1 / 4;
@@ -1389,7 +1386,7 @@ void Gemmer::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
 }
 // C = A * B
-void Gemmer::VecWriteBasic(int n, float *c, float *C, int ldc) {
+void VecWriteBasic(int n, float *c, float *C, int ldc) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
  int nc2 = _nc1 / 4;
@@ -1435,10 +1432,10 @@ void Gemmer::VecWriteBasic(int n, float *c, float *C, int ldc) {
 }
 // C = alpha * A * B + beta * C
-void Gemmer::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
+void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
 // C = A * B + C
-void Gemmer::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
+void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
@@ -1476,7 +1473,7 @@ void Gemmer::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
 }
 // C = A * B + C, relu(C)
-void Gemmer::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
+void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
@@ -1524,8 +1521,8 @@ void Gemmer::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
 }
 // C = A * B, batchnorm(C)
-void Gemmer::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
+void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
-                            float *bias) {
+                    float *bias) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
  int nc2 = _nc1 / 4;
@@ -1591,8 +1588,8 @@ void Gemmer::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
 }
 // C = A * B, batchnorm(C), relu(C)
-void Gemmer::VecWriteWithBnRelu(int n, float *c, float *C, int ldc,
+void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
-                                float *scale, float *bias) {
+                        float *bias) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
  int nc2 = _nc1 / 4;

--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include <vector>
 // 矩阵取值运算宏，假设矩阵按行存储
 #define A(i, j) A[(i)*lda + (j)]
@@ -28,111 +27,88 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-struct Gemmer {
-  int MC = 0;
-  int KC = 0;
-  int NC = 0;
-  float *packedA;
+// 将 A 矩阵分块复制到连续内存(ColMajor)
-  float *packedB;
+void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
-  float *packedC;
+                 float *buffer);
-  float *zero;
-  static std::vector<Gemmer *> gemmers;
+// 将 B 矩阵分块复制到连续内存(ColMajor)
+void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
-  // 将 A 矩阵分块复制到连续内存(ColMajor)
+                 float *buffer);
-  void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
-                   float *buffer);
+// 将 A 矩阵分块复制到连续内存(RowMajor)
+void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-  // 将 B 矩阵分块复制到连续内存(ColMajor)
+                  float *buffer);
-  void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
-                   float *buffer);
+// 将 B 矩阵分块复制到连续内存(RowMajor)
+void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
-  // 将 A 矩阵分块复制到连续内存(RowMajor)
+                  float *buffer);
-  void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-                    float *buffer);
+// 分块矩阵乘法
+void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
-  // 将 B 矩阵分块复制到连续内存(RowMajor)
+                 float beta, float *c, float *C, int ldc, bool relu);
-  void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
-                    float *buffer);
+void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+                       const float *b, float beta, float *c, float *C, int ldc,
-  // 分块矩阵乘法
+                       bool relu, float *new_scale, float *new_bias);
-  void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
-                   float beta, float *c, float *C, int ldc, bool relu);
+// 向量矩阵乘法 (M = 1)
+void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
-  void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+                  const float *B, int ldb, float beta, float *C, int ldc,
-                         const float *b, float beta, float *c, float *C,
+                  bool relu);
-                         int ldc, bool relu, float *new_scale, float *new_bias);
+void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
-  // 向量矩阵乘法 (M = 1)
+                        int lda, const float *B, int ldb, float beta, float *C,
-  void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
+                        int ldc, bool relu, float *new_scale, float *new_bias);
-                    const float *B, int ldb, float beta, float *C, int ldc,
-                    bool relu);
+// 计算一个更小的 C 矩阵分块
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
-  void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
-                          int lda, const float *B, int ldb, float beta,
-                          float *C, int ldc, bool relu, float *new_scale,
+// 分块矩阵乘法结果回写
-                          float *new_bias);
+// C = A * B
+void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
-  // 计算一个更小的 C 矩阵分块
+// C = alpha * A * B + beta * C
-  void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
+void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + C
-  void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
+void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + C, relu(C)
-  // 分块矩阵乘法结果回写
+void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
-  // C = A * B
+// C = A * B, batchnorm(C)
-  void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
+void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
+                 float *new_bias);
-  // C = alpha * A * B + beta * C
+// C = A * B, batchnorm(C), relu(C)
-  void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
+void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+                     float *new_scale, float *new_bias);
-  // C = A * B + C
-  void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
+// 向量矩阵乘法结果回写
+// C = A * B
-  // C = A * B + C, relu(C)
+void VecWriteBasic(int n, float *c, float *C, int ldc);
-  void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
+// C = alpha * A * B + beta * C
+void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
-  // C = A * B, batchnorm(C)
+// C = A * B + C
-  void WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
+void VecWriteWithAdd(int n, float *c, float *C, int ldc);
-                   float *new_scale, float *new_bias);
+// C = A * B + C, relu(C)
+void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
-  // C = A * B, batchnorm(C), relu(C)
+// C = A * B, batchnorm(C)
-  void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
-                       float *new_scale, float *new_bias);
+                    float *new_bias);
+// C = A * B, batchnorm(C), relu(C)
-  // 向量矩阵乘法结果回写
+void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
-  // C = A * B
+                        float *new_bias);
-  void VecWriteBasic(int n, float *c, float *C, int ldc);
+// 32位 float 矩阵乘法
-  // C = alpha * A * B + beta * C
+void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-  void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
+           const float *B, int ldb, float beta, float *C, int ldc, bool relu);
-  // C = A * B + C
+// 32位 float 矩阵乘法, 并对结果进行 batchnorm
-  void VecWriteWithAdd(int n, float *c, float *C, int ldc);
+void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
-  // C = A * B + C, relu(C)
+                 bool relu, float *new_scale, float *new_bias);
-  void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
+// 64位 double 矩阵乘法
-  // C = A * B, batchnorm(C)
+void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
-  void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
+           const double *B, int ldb, float beta, double *C, int ldc);
-                      float *new_bias);
-  // C = A * B, batchnorm(C), relu(C)
-  void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
-                          float *new_bias);
-  // 32位 float 矩阵乘法
-  void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-             const float *B, int ldb, float beta, float *C, int ldc, bool relu);
-  // 32位 float 矩阵乘法, 并对结果进行 batchnrom
-  void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
-                   const float *B, int ldb, float beta, float *C, int ldc,
-                   bool relu, float *new_scale, float *new_bias);
-  // 64位 double 矩阵乘法
-  void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
-             const double *B, int ldb, float beta, double *C, int ldc);
-};
 }  // namespace math
 }  // namespace operators

--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -26,14 +26,23 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
+  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
+  //  dim_out.size() ==
+  //  2,
+  //                 "The input and output of matmul be matrix");
+  //
+  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
+  //                     platform::is_cpu_place(matrix_b.place())
+  //                     &&
+  //                     platform::is_cpu_place(matrix_out->place()),
+  //                 "Matrix must all be in CPUPlace");
  int M = dim_out[0];
  int N = dim_out[1];
  int K = (!trans_a) ? dim_a[1] : dim_a[0];
-  Gemmer::gemmers[0]->Sgemm(M, N, K, alpha, matrix_a.data<float>(), K,
+  Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-                            matrix_b.data<float>(), N, beta,
+        beta, matrix_out->data<float>(), N, relu);
-                            matrix_out->data<float>(), N, relu);
 }
 template <>
@@ -45,15 +54,24 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
+  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
+  //  dim_out.size() ==
+  //  2,
+  //                 "The input and output of matmul be matrix");
+  //
+  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
+  //                     platform::is_cpu_place(matrix_b.place())
+  //                     &&
+  //                     platform::is_cpu_place(matrix_out->place()),
+  //                 "Matrix must all be in CPUPlace");
  int M = dim_out[0];
  int N = dim_out[1];
  int K = (!trans_a) ? dim_a[1] : dim_a[0];
-  Gemmer::gemmers[0]->SgemmWithBn(
+  SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
-      M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
+              N, beta, matrix_out->data<float>(), N, relu,
-      beta, matrix_out->data<float>(), N, relu, new_scale->data<float>(),
+              new_scale->data<float>(), new_bias->data<float>());
-      new_bias->data<float>());
 }
 }  // namespace math

--- a/src/operators/math/pool_3x3.cpp
+++ b/src/operators/math/pool_3x3.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 #endif
 #include "framework/tensor.h"
 #include "pool_3x3.h"
-#ifdef __ARM_NEON
+#if __ARM_NEON
 #include <arm_neon.h>
 #endif  // __ARM_NEON
 #include <climits>
@@ -30,7 +30,7 @@ using std::max;
 using std::min;
 using std::vector;
 void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
  const int batch_size = input->dims()[0];
  const int h_in = input->dims()[2];
@@ -280,7 +280,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
 }
 void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
  const int batch_size = input->dims()[0];
  const int h_in = input->dims()[2];
@@ -523,7 +523,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
 void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
                Tensor *output) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
  const int batch_size = input->dims()[0];
  const int input_height = input->dims()[2];
@@ -582,7 +582,7 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
            }
            output_seg[ph * output_width + pw] = max_value;
          } else {
-#ifdef ARMV7
+#if defined(ARMV7)
            asm volatile(
                "vld1.32  {q1}, [%[pos1]]        \n\t"
                "vld1.32  {q2}, [%[pos2]]        \n\t"
@@ -622,7 +622,7 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
 void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
                Tensor *output) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
  const int batch_size = input->dims()[0];
  const int input_height = input->dims()[2];
@@ -676,7 +676,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
            }
            output_seg[ph * output_width + pw] = sum / 9.0;
          } else {
-#ifdef ARMV7
+#if defined(ARMV7)
            asm volatile(
                "vld1.32  {q1}, [%[pos1]]        \n\t"

--- a/src/operators/math/pool_3x3.h
+++ b/src/operators/math/pool_3x3.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "framework/tensor.h"
-#ifdef __ARM_NEON
+#if __ARM_NEON
 #include <arm_neon.h>
 #endif  // __ARM_NEON

--- a/src/operators/math/pooling.cpp
+++ b/src/operators/math/pooling.cpp
@@ -60,8 +60,8 @@ class PoolFunctor<CPU, PoolProcess, T> {
    T *output_data = output->mutable_data<T>();
    for (int i = 0; i < batch_size; i++) {
-      for (int c = 0; c < output_channels; ++c) {
 #pragma omp parallel for
+      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
          int hend = std::min(hstart + ksize_height, input_height);