diff --git a/README.md b/README.md index 91b4f886a31bb839a7e513185464260b1e95c453..69362734116fd8af78442a07dd31600aa46b7935 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Paddle-Moible是PaddlePaddle组织下的项目,是一个致力于嵌入式平 ![](http://7xop3k.com1.z0.glb.clouddn.com/15312108766575.jpg) arm cpu是paddle-mobile的主要支持方向,cpu的通用性一直是其优势。嵌入式深度学习,需要大量的cpu汇编实现。我们正在紧锣密鼓的编码,为的是能充分硬件的每一点加速能力。 - arm cpu的优化工作还在进行中,现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是120+ms,显然这不是我们的最终目标,我们正在用大量的汇编改写,后续性能仍会有巨大提升空间。 + arm cpu的优化工作还在进行中,现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是120+ms,显然这不是我们的最终目标,我们正在用大量的汇编改写,后续性能仍会有巨大提升空间, 目前只支持armv7, 未来我们也会支持armv8。 - **Mali GPU** diff --git a/src/io/executor.cpp b/src/io/executor.cpp index 510fc8d7db0d957b619e4dbeb25fc14b9768327c..480f48290cc1bbf4888832d76187a13a4915ec40 100644 --- a/src/io/executor.cpp +++ b/src/io/executor.cpp @@ -26,9 +26,6 @@ limitations under the License. */ #include "framework/program/var_desc.h" #include "framework/scope.h" #include "framework/tensor.h" -#ifdef _OPENMP -#include -#endif // _OPENMP #ifdef PADDLE_EXECUTOR_MULTITHREAD #include #include @@ -407,14 +404,6 @@ std::vector::Ptype> Executor::Predict( return result_vector; } -template -void Executor::SetThreadNum(int num) { -#ifdef _OPENMP - // omp_set_dynamic(0); - omp_set_num_threads(num); -#endif -} - template class Executor; template class Executor; template class Executor; diff --git a/src/io/executor.h b/src/io/executor.h index 28b0d65181355fd76e4ec09aa5964130aee2ab68..f8f2a8ad5657fdb3cf6cb249e32537bd5e866913 100644 --- a/src/io/executor.h +++ b/src/io/executor.h @@ -58,8 +58,6 @@ class Executor { std::vector Predict(const std::vector &input, const std::vector &dims); - void SetThreadNum(int num); - protected: Executor() = default; void InitMemory(); diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index 3d5735f8da66db6f4b5f139f8261a4cd9cf0f796..cabdd799a0e7d561d8bc56c0913f1389c38f8907 100644 --- 
a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -16,6 +16,14 @@ limitations under the License. */ namespace paddle_mobile { +template +void PaddleMobile::SetThreadNum(int num) { +#ifdef _OPENMP + // omp_set_dynamic(0); + omp_set_num_threads(num); +#endif +} + template bool PaddleMobile::Load(const std::string &dirname, bool optimize, int batch_size) { @@ -81,7 +89,9 @@ PaddleMobile::~PaddleMobile() { } template class PaddleMobile; + template class PaddleMobile; + template class PaddleMobile; } // namespace paddle_mobile diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index 3ce39e0ae1ffc7e193f6f4308a911875fdf95076..74c11471566c3db8a37ea2d62e0496e5d40cb3b7 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -17,6 +17,9 @@ limitations under the License. */ #include #include #include +#ifdef _OPENMP +#include <omp.h> +#endif // _OPENMP #include "common/types.h" #include "framework/tensor.h" @@ -44,6 +47,7 @@ class PaddleMobile { * */ bool Load(const std::string &model_path, const std::string &para_path, bool optimize = false, int batch_size = 1); + void SetThreadNum(int num); /* * @b to predict
*/ #ifndef X86 #include #endif +#ifdef _OPENMP +#include <omp.h> +#endif namespace paddle_mobile { namespace operators { @@ -158,6 +161,7 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, // 分块矩阵乘法 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, float beta, float *c, float *C, int ldc, bool relu) { +#pragma omp parallel for for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); @@ -187,6 +191,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, const float *b, float beta, float *c, float *C, int ldc, bool relu, float *new_scale, float *new_bias) { +#pragma omp parallel for for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index 1851f2668dee3a10e72b5dbeeadb9f51827a2729..2ab24736397c1e71350335561abbcabcba6e27a4 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -17,26 +17,21 @@ limitations under the License.
*/ #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); bool optimize = true; auto time1 = time(); - // auto program = loader.Load(g_googlenet, optimize); - auto program = loader.Load(g_googlenet_combine + "/model", - g_googlenet_combine + "/params", optimize); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time2) << "ms\n"; - paddle_mobile::Executor executor(program, 1, optimize); - executor.SetThreadNum(4); - std::vector input; - std::vector dims{1, 3, 224, 224}; - GetInput(g_test_image_1x3x224x224, &input, dims); - auto time3 = time(); - int count = 1; - for (int i = 0; i < count; ++i) { - executor.Predict(input, dims); - } + if (paddle_mobile.Load(g_googlenet, optimize)) { + auto time2 = time(); + DLOG << "load cost :" << time_diff(time1, time2) << "ms"; + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224, &input, dims); + auto time3 = time(); + auto vec_result = paddle_mobile.Predict(input, dims); + auto time4 = time(); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) / count << "ms\n"; + DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + } return 0; } diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 1f38dc5d19d0e7bb54faf75a41419941e8b1f412..2e285695fb79f3ed5471a653c71a10b36ef4e7f2 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -18,6 +18,7 @@ limitations under the License. */ int main() { paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); auto time1 = time(); if (paddle_mobile.Load(g_mobilenet, true)) { auto time2 = time();