Merge pull request #542 from cocodark/develop

accelerate with openmp

Merge pull request #542 from cocodark/develop
accelerate with openmp
1cff3bfe · WangLiu · GitHub · 866ab5fc · f312f389 · 1cff3bfe
12 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.0)
 project(paddle-mobile)
 option(DEBUGING "enable debug mode" ON)
-option(USE_OPENMP "openmp support" OFF)
+option(USE_OPENMP "openmp support" ON)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build

--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "io/executor.h"
+#include <operators/math/gemm.h>
 #include <algorithm>
 #include <vector>
 #include "common/enforce.h"
@@ -25,6 +26,9 @@ limitations under the License. */
 #include "framework/program/var_desc.h"
 #include "framework/scope.h"
 #include "framework/tensor.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif  // _OPENMP
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
 #include <queue>
 #include <utility>
@@ -403,6 +407,17 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
  return result_vector;
 }
+// Configures multi-threaded execution: sizes the shared Gemmer pool and, when
+// built with OpenMP, sets the number of OpenMP threads.
+//
+// num: requested thread count. Values below 1 are treated as 1, because
+//      omp_set_num_threads() requires a positive argument.
+//
+// The pool is grown to max(num, 4) entries. The parallel GEMM in ConvAddBasic
+// splits rows four ways and indexes gemmers[0..3] regardless of the thread
+// count, so a smaller pool would be read out of bounds. Calling this function
+// again only tops the pool up; it never appends a second set of Gemmer objects.
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::SetThreadNum(int num) {
+  const int thread_num = std::max(num, 1);
+  // Must cover the fixed four-way row split used by ConvAddBasic.
+  const int min_gemmers = 4;
+  const int wanted = std::max(thread_num, min_gemmers);
+  auto &gemmers = operators::math::Gemmer::gemmers;
+  while (static_cast<int>(gemmers.size()) < wanted) {
+    // Pool entries are owned by the static vector and are not released here.
+    gemmers.push_back(new operators::math::Gemmer());
+  }
+#ifdef _OPENMP
+  //  omp_set_dynamic(0);
+  omp_set_num_threads(thread_num);
+#endif
+}
 template class Executor<CPU, Precision::FP32>;
 template class Executor<FPGA, Precision::FP32>;
 template class Executor<GPU_MALI, Precision::FP32>;

--- a/src/io/executor.h
+++ b/src/io/executor.h
@@ -58,6 +58,8 @@ class Executor {
  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
                             const std::vector<int64_t> &dims);
+  void SetThreadNum(int num);
 protected:
  Executor() = default;
  void InitMemory();

--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -14,10 +14,14 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_OP
 #pragma once
+#if _OPENMP
+#include <omp.h>
+#endif
 #include <vector>
 #include "operators/math/conv_func.h"
 #include "operators/math/depthwise_conv_3x3.h"
+#include "operators/math/gemm.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
@@ -106,9 +110,33 @@ void ConvAddBasic(const FusionConvAddParam &param) {
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
+      auto dim_a = filter_slice.dims();
-                          static_cast<float>(1));
+      auto dim_b = col_matrix.dims();
+      auto dim_out = out_slice.dims();
+      int m = dim_out[0];
+      int n = dim_out[1];
+      int k = dim_a[1];
+      float *output_data = out_slice.data<float>();
+      int thread_num = 4;
+      int m1 = m / thread_num;
+      int m2 = m % thread_num;
+#pragma omp parallel for
+      for (int j = 0; j < thread_num; ++j) {
+        int row_count = m1;
+        if (j == thread_num - 1) {
+          row_count = m1 + m2;
+        }
+        math::Gemmer::gemmers[j]->Sgemm(
+            row_count, n, k, 1, filter_slice.data<float>() + j * m1 * k, k,
+            col_matrix.data<float>(), n, 1, output_data + j * m1 * n, n, false);
+      }
+      //        math::matmul<float>(filter_slice, false, col_matrix, false,
+      //                            static_cast<float>(1), &out_slice,
+      //                            static_cast<float>(1));
    }
  }
 }

--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef LRN_OP
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 #include "framework/operator.h"
 #include "operators/op_param.h"
@@ -47,6 +49,7 @@ struct LRNFunctor {
    std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0);
    for (int a = 0; a < N; a++) {
+#pragma parallel for
      for (int b = 0; b < C; b++) {
        for (int index = start; index < end; index++) {
          int channel = b + index;

--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -22,16 +22,10 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-int MC = 0;
-int KC = 0;
+std::vector<Gemmer *> Gemmer::gemmers;
-int NC = 0;
-float *packedA;
-float *packedB;
-float *packedC;
-float *zero;
 // 将A矩阵分块复制到连续内存(ColMajor)
-void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
+void Gemmer::PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                         float *buffer) {
  int i, j;
  const float *Aij;
@@ -58,7 +52,7 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
 }
 // 将A矩阵分块复制到连续内存(RowMajor)
-void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
+void Gemmer::PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
                          float *buffer) {
  const float *a0, *a1, *a2, *a3;
  for (int i = 0; i < m - m_tail; i += MR) {
@@ -98,7 +92,7 @@ void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
 }
 // 将B矩阵分块复制到连续内存(ColMajor)
-void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
+void Gemmer::PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                         float *buffer) {
  int i, j;
  const float *Bj, *Bj1, *Bj2, *Bj3;
@@ -127,7 +121,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 }
 // 将B矩阵分块复制到连续内存(RowMajor)
-void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
+void Gemmer::PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
                          float *buffer) {
  const float *b0;
  for (int j = 0; j < n - n_tail; j += NR) {
@@ -156,8 +150,9 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
 }
 // 分块矩阵乘法
-void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
+void Gemmer::InnerKernel(int mc, int nc, float alpha, const float *a,
-                 float beta, float *c, float *C, int ldc, bool relu) {
+                         const float *b, float beta, float *c, float *C,
+                         int ldc, bool relu) {
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
      // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
@@ -184,9 +179,10 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
 }
 // 分块矩阵乘法
-void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+void Gemmer::InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
-                       const float *b, float beta, float *c, float *C, int ldc,
+                               const float *b, float beta, float *c, float *C,
-                       bool relu, float *new_scale, float *new_bias) {
+                               int ldc, bool relu, float *new_scale,
+                               float *new_bias) {
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
      // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
@@ -202,7 +198,8 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
 }
 #if defined(IOS)
-void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
+void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *C,
+                       int ldc) {
  // init C
  float32x4_t cv0 = vdupq_n_f32(0.0);
  float32x4_t cv1 = vdupq_n_f32(0.0);
@@ -253,7 +250,8 @@ void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
 }  // namespace math
 #elif defined(ARMV7)
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
+                       int ldc) {
  const float *a_ptr, *b_ptr;
  a_ptr = a;
  b_ptr = b;
@@ -324,7 +322,8 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
 }
 #else
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
+                       int ldc) {
  float *c0, *c1, *c2, *c3;
  c0 = c;
  c1 = c + ldc;
@@ -363,8 +362,9 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
 #endif
 // 32位 float 矩阵乘法
-void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+void Gemmer::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
+                   const float *B, int ldb, float beta, float *C, int ldc,
+                   bool relu) {
  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
  int L1 = 30 * 1024;
@@ -415,9 +415,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
  paddle_mobile::memory::Free(zero);
 }
-void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+void Gemmer::SgemmWithBn(int m, int n, int k, float alpha, const float *A,
-                 const float *B, int ldb, float beta, float *C, int ldc,
+                         int lda, const float *B, int ldb, float beta, float *C,
-                 bool relu, float *new_scale, float *new_bias) {
+                         int ldc, bool relu, float *new_scale,
+                         float *new_bias) {
  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
  int L1 = 30 * 1024;
@@ -468,9 +469,9 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
  paddle_mobile::memory::Free(zero);
 }
-void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
+void Gemmer::VectorKernel(int m, int n, int k, float alpha, const float *A,
-                  const float *B, int ldb, float beta, float *C, int ldc,
+                          int lda, const float *B, int ldb, float beta,
-                  bool relu) {
+                          float *C, int ldc, bool relu) {
  float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
  const float *a0, *b0, *b1, *b2, *b3;
@@ -690,9 +691,10 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
  }
 }
-void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+void Gemmer::VectorKernelWithBn(int m, int n, int k, float alpha,
-                        int lda, const float *B, int ldb, float beta, float *C,
+                                const float *A, int lda, const float *B,
-                        int ldc, bool relu, float *new_scale, float *new_bias) {
+                                int ldb, float beta, float *C, int ldc,
+                                bool relu, float *new_scale, float *new_bias) {
  float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
  const float *a0, *b0, *b1, *b2, *b3;
@@ -901,7 +903,8 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
  }
 }
-void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemmer::AddDot4x8(int k, const float *a, const float *b, float *c,
+                       int ldc) {
  const float *a_ptr, *b_ptr;
  a_ptr = a;
  b_ptr = b;
@@ -1009,7 +1012,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
 }
 // C = A * B
-void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
+void Gemmer::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int step = 4 * ldc;
@@ -1066,10 +1069,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = alpha * A * B + beta * C
-void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
+void Gemmer::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}  // Stub: empty body, so nothing is written back to C.
 // C = A * B + C
-void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
+void Gemmer::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int step = 4 * ldc;
@@ -1133,7 +1136,7 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = A * B + C, relu(C)
-void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
+void Gemmer::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int step = 4 * ldc;
@@ -1207,8 +1210,8 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = A * B, batchnorm(C)
-void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
+void Gemmer::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
-                 float *bias) {
+                         float *scale, float *bias) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int nc2 = _nc1 / 4;
@@ -1293,8 +1296,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
 }
 // C = A * B, batchnorm(C), relu(C)
-void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
+void Gemmer::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
-                     float *bias) {
+                             float *scale, float *bias) {
  int nc1 = nc / 16;
  int _nc1 = nc % 16;
  int nc2 = _nc1 / 4;
@@ -1386,7 +1389,7 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
 }
 // C = A * B
-void VecWriteBasic(int n, float *c, float *C, int ldc) {
+void Gemmer::VecWriteBasic(int n, float *c, float *C, int ldc) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
  int nc2 = _nc1 / 4;
@@ -1432,10 +1435,10 @@ void VecWriteBasic(int n, float *c, float *C, int ldc) {
 }
 // C = alpha * A * B + beta * C
-void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
+void Gemmer::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}  // Stub: empty body, so nothing is written back to C.
 // C = A * B + C
-void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
+void Gemmer::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
@@ -1473,7 +1476,7 @@ void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
 }
 // C = A * B + C, relu(C)
-void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
+void Gemmer::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
@@ -1521,7 +1524,7 @@ void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
 }
 // C = A * B, batchnorm(C)
-void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
+void Gemmer::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
                            float *bias) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
@@ -1588,8 +1591,8 @@ void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
 }
 // C = A * B, batchnorm(C), relu(C)
-void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
+void Gemmer::VecWriteWithBnRelu(int n, float *c, float *C, int ldc,
-                        float *bias) {
+                                float *scale, float *bias) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
  int nc2 = _nc1 / 4;

--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <vector>
 // 矩阵取值运算宏，假设矩阵按行存储
 #define A(i, j) A[(i)*lda + (j)]
@@ -27,88 +28,111 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {
+// Bundles the GEMM kernels together with the state they work on, so that
+// several independent instances can be used side by side (ConvAddBasic hands
+// one instance from `gemmers` to each parallel loop iteration).
+struct Gemmer {
-// 将 A 矩阵分块复制到连续内存(ColMajor)
+  // Block sizes; zero until assigned (presumably chosen inside Sgemm /
+  // SgemmWithBn -- confirm against gemm.cpp).
+  int MC = 0;
-void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
+  int KC = 0;
+  int NC = 0;
+  // Scratch buffers. Null-initialized so that a default-initialized Gemmer
+  // never carries indeterminate pointers. `zero` is released with
+  // memory::Free at the end of Sgemm / SgemmWithBn, so these pointers must
+  // not be read between calls.
+  float *packedA = nullptr;
+  float *packedB = nullptr;
+  float *packedC = nullptr;
+  float *zero = nullptr;
+  // Shared pool of instances; defined in gemm.cpp.
+  static std::vector<Gemmer *> gemmers;
+  // Pack matrix A block-wise into contiguous memory (ColMajor)
+  void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                   float *buffer);
-// 将 B 矩阵分块复制到连续内存(ColMajor)
+  // Pack matrix B block-wise into contiguous memory (ColMajor)
-void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
+  void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                   float *buffer);
-// 将 A 矩阵分块复制到连续内存(RowMajor)
+  // Pack matrix A block-wise into contiguous memory (RowMajor)
-void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
+  void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
                    float *buffer);
-// 将 B 矩阵分块复制到连续内存(RowMajor)
+  // Pack matrix B block-wise into contiguous memory (RowMajor)
-void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
+  void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
                    float *buffer);
-// 分块矩阵乘法
+  // Blocked matrix multiplication
-void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
+  void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
                   float beta, float *c, float *C, int ldc, bool relu);
-void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+  void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
-                       const float *b, float beta, float *c, float *C, int ldc,
+                         const float *b, float beta, float *c, float *C,
-                       bool relu, float *new_scale, float *new_bias);
+                         int ldc, bool relu, float *new_scale, float *new_bias);
-// 向量矩阵乘法 (M = 1)
+  // Vector-matrix multiplication (M = 1)
-void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
+  void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
                    const float *B, int ldb, float beta, float *C, int ldc,
                    bool relu);
-void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+  void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
-                        int lda, const float *B, int ldb, float beta, float *C,
+                          int lda, const float *B, int ldb, float beta,
-                        int ldc, bool relu, float *new_scale, float *new_bias);
+                          float *C, int ldc, bool relu, float *new_scale,
-// 计算一个更小的 C 矩阵分块
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
-void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
-// 分块矩阵乘法结果回写
-// C = A * B
-void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
-// C = alpha * A * B + beta * C
-void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
-// C = A * B + C
-void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
-// C = A * B + C, relu(C)
-void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
-// C = A * B, batchnorm(C)
-void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
                          float *new_bias);
-// C = A * B, batchnorm(C), relu(C)
-void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+  // Compute a smaller sub-block of matrix C
+  void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
+  void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
+  // Write-back of blocked matrix multiplication results
+  // C = A * B
+  void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
+  // C = alpha * A * B + beta * C
+  void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
+  // C = A * B + C
+  void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
+  // C = A * B + C, relu(C)
+  void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
+  // C = A * B, batchnorm(C)
+  void WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
                   float *new_scale, float *new_bias);
-// 向量矩阵乘法结果回写
+  // C = A * B, batchnorm(C), relu(C)
-// C = A * B
+  void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
-void VecWriteBasic(int n, float *c, float *C, int ldc);
+                       float *new_scale, float *new_bias);
-// C = alpha * A * B + beta * C
-void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
+  // Write-back of vector-matrix multiplication results
-// C = A * B + C
+  // C = A * B
-void VecWriteWithAdd(int n, float *c, float *C, int ldc);
+  void VecWriteBasic(int n, float *c, float *C, int ldc);
-// C = A * B + C, relu(C)
-void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
+  // C = alpha * A * B + beta * C
-// C = A * B, batchnorm(C)
+  void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
-void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
+  // C = A * B + C
+  void VecWriteWithAdd(int n, float *c, float *C, int ldc);
+  // C = A * B + C, relu(C)
+  void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
+  // C = A * B, batchnorm(C)
+  void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
                      float *new_bias);
-// C = A * B, batchnorm(C), relu(C)
-void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
+  // C = A * B, batchnorm(C), relu(C)
+  void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
                          float *new_bias);
-// 32位 float 矩阵乘法
+  // 32-bit float matrix multiplication
-void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+  void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
             const float *B, int ldb, float beta, float *C, int ldc, bool relu);
-// 32位 float 矩阵乘法, 并对结果进行 batchnrom
+  // 32-bit float matrix multiplication, followed by batchnorm on the result
-void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+  void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                   const float *B, int ldb, float beta, float *C, int ldc,
                   bool relu, float *new_scale, float *new_bias);
-// 64位 double 矩阵乘法
+  // 64-bit double matrix multiplication
+  // NOTE(review): alpha/beta are float although the matrices are double.
-void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
+  void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
             const double *B, int ldb, float beta, double *C, int ldc);
+};
 }  // namespace math
 }  // namespace operators

--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -26,23 +26,14 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
-  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
-  //  dim_out.size() ==
-  //  2,
-  //                 "The input and output of matmul be matrix");
-  //
-  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-  //                     platform::is_cpu_place(matrix_b.place())
-  //                     &&
-  //                     platform::is_cpu_place(matrix_out->place()),
-  //                 "Matrix must all be in CPUPlace");
  int M = dim_out[0];
  int N = dim_out[1];
  int K = (!trans_a) ? dim_a[1] : dim_a[0];
-  Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
+  Gemmer::gemmers[0]->Sgemm(M, N, K, alpha, matrix_a.data<float>(), K,
-        beta, matrix_out->data<float>(), N, relu);
+                            matrix_b.data<float>(), N, beta,
+                            matrix_out->data<float>(), N, relu);
 }
 template <>
@@ -54,24 +45,15 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
-  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
-  //  dim_out.size() ==
-  //  2,
-  //                 "The input and output of matmul be matrix");
-  //
-  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-  //                     platform::is_cpu_place(matrix_b.place())
-  //                     &&
-  //                     platform::is_cpu_place(matrix_out->place()),
-  //                 "Matrix must all be in CPUPlace");
  int M = dim_out[0];
  int N = dim_out[1];
  int K = (!trans_a) ? dim_a[1] : dim_a[0];
-  SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
+  Gemmer::gemmers[0]->SgemmWithBn(
-              N, beta, matrix_out->data<float>(), N, relu,
+      M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-              new_scale->data<float>(), new_bias->data<float>());
+      beta, matrix_out->data<float>(), N, relu, new_scale->data<float>(),
+      new_bias->data<float>());
 }
 }  // namespace math

--- a/src/operators/math/pool_3x3.cpp
+++ b/src/operators/math/pool_3x3.cpp
--- a/src/operators/math/pool_3x3.h
+++ b/src/operators/math/pool_3x3.h
@@ -15,6 +15,9 @@ limitations under the License. */
 #ifdef POOL_OP
 #pragma once
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 #include <algorithm>
 #include <vector>
 #include "framework/tensor.h"

--- a/src/operators/math/pooling.cpp
+++ b/src/operators/math/pooling.cpp
@@ -16,6 +16,9 @@ limitations under the License. */
 #include "pooling.h"
 #include "common/types.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 namespace paddle_mobile {
 namespace operators {
@@ -57,8 +60,8 @@ class PoolFunctor<CPU, PoolProcess, T> {
    T *output_data = output->mutable_data<T>();
    for (int i = 0; i < batch_size; i++) {
-      //  #pragma omp parallel for
      for (int c = 0; c < output_channels; ++c) {
+#pragma omp parallel for
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
          int hend = std::min(hstart + ksize_height, input_height);

--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -26,16 +26,17 @@ int main() {
  auto time2 = time();
  DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
+  executor.SetThreadNum(4);
  std::vector<float> input;
  std::vector<int64_t> dims{1, 3, 224, 224};
  GetInput<float>(g_test_image_1x3x224x224, &input, dims);
  auto time3 = time();
+  int count = 1;
-  for (int i = 0; i < 10; ++i) {
+  for (int i = 0; i < count; ++i) {
    executor.Predict(input, dims);
  }
  auto time4 = time();
-  DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
+  DLOG << "predict cost :" << time_diff(time3, time4) / count << "ms\n";
  return 0;
 }