diff --git a/README.md b/README.md
index c29165d57204561e702997187d55e6cf869c4b39..825c417c5fb400614ad7805f6ebdf8222d846134 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,12 @@ Paddle-Moible是PaddlePaddle组织下的项目，是一个致力于嵌入式平
 |||||
 |googlenet(v1) arm v7|1线程|2线程|4线程|
 |麒麟960(ms)|348.018|242.689|169.998|
+|||||
+|squeezenet arm v7|1线程|2线程|4线程|
+|麒麟960(ms)|84.685|56.544|38.833|
+|||||
+|yolo arm v7|1线程|2线程|4线程|
+|麒麟960(ms)|131.831|88.990|60.905|
 
     arm cpu是paddle-mobile的主要支持方向，cpu的通用性一直是其优势。嵌入式深度学习，需要大量的cpu汇编实现。我们正在紧锣密鼓的编码，为的是能充分硬件的每一点加速能力。
     arm cpu的优化工作还在进行中，现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是110+ms，显然这不是我们的最终目标，我们正在用大量的汇编改写，后续性能仍会有巨大提升空间, 目前只支持armv7, 未来我们也会支持armv8。
diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h
index cda97ba1a342e5b9451fd8363643f638792e3579..e8a9498819cae330abbd4a007a6510d89f167114 100644
--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <vector>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
 #include "operators/kernel/conv_add_relu_kernel.h"
@@ -65,11 +67,11 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
 
 #ifdef PADDLE_MOBILE_CPU
 
-//#ifndef CONV_ADD_RELU_REGISTER
-//#define CONV_ADD_RELU_REGISTER
-// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new
-// FusionConvAddReluOpMatcher());
-//#endif
+#ifndef CONV_ADD_RELU_REGISTER
+#define CONV_ADD_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
+    new FusionConvAddReluOpMatcher());
+#endif
 
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp
index c505c61fce21775136a368949a451999b97b3069..386c09d71a3d5709842991bffd2e8ea039edc940 100644
--- a/test/common/test_gemm_perf.cpp
+++ b/test/common/test_gemm_perf.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <iostream>
 #include "../test_helper.h"
+#include "../test_include.h"
 #include "operators/math/gemm.h"
 #include "operators/math/math_function.h"
 
@@ -26,6 +27,8 @@ limitations under the License. */
 #define k 1024
 
 int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
   Tensor aa, bb, cc, scale, bias;
   auto aaptr = aa.mutable_data<float>({m, k});
   auto bbptr = bb.mutable_data<float>({k, n});
diff --git a/test/net/test_resnet.cpp b/test/net/test_resnet.cpp
index 883ad95392ad351a2634e1a56ac050f02d8767e6..73ac88ef77b0c02545ef55b6493d4681c61c192d 100644
--- a/test/net/test_resnet.cpp
+++ b/test/net/test_resnet.cpp
@@ -12,16 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
   auto time1 = time();
-  if (paddle_mobile.Load(g_resnet, false)) {
+  if (paddle_mobile.Load(g_resnet, true)) {
     auto time2 = time();
-    DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
     std::vector<int64_t> dims{1, 3, 32, 32};
     Tensor input_tensor;
     SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, static_cast<float>(0),
@@ -29,10 +30,15 @@ int main() {
 
     std::vector<float> input(input_tensor.data<float>(),
                              input_tensor.data<float>() + input_tensor.numel());
-    auto time3 = time();
+    // Warm-up run, issued before time3 so it is excluded from the timing.
     paddle_mobile.Predict(input, dims);
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      paddle_mobile.Predict(input, dims);
+    }
     auto time4 = time();
-    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
+              << std::endl;
   }
 
   return 0;
diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp
index 39d4687ff3de37c571ee89213485fb0b6bc939df..4c14f63bde40675a7e0016e28d900788431ff2ae 100644
--- a/test/net/test_squeezenet.cpp
+++ b/test/net/test_squeezenet.cpp
@@ -12,18 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(2);
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
   auto time1 = time();
-  if (paddle_mobile.Load(g_squeezenet, false)) {
+  if (paddle_mobile.Load(g_squeezenet, true)) {
     auto time2 = time();
-    DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
     std::vector<int64_t> dims{1, 3, 227, 227};
     Tensor input_tensor;
     SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
@@ -31,10 +32,15 @@ int main() {
 
     std::vector<float> input(input_tensor.data<float>(),
                              input_tensor.data<float>() + input_tensor.numel());
-    auto time3 = time();
+    // Warm-up run, issued before time3 so it is excluded from the timing.
     paddle_mobile.Predict(input, dims);
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      paddle_mobile.Predict(input, dims);
+    }
     auto time4 = time();
-    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
+              << std::endl;
   }
 
   return 0;
diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp
index 65dec59ad0579d362c75ae6ec1d362fb957d4fc5..83508cff335c55f5cc416c6652d83706a4626c1a 100644
--- a/test/net/test_yolo.cpp
+++ b/test/net/test_yolo.cpp
@@ -12,18 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(2);
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
   auto time1 = time();
-  if (paddle_mobile.Load(g_yolo, false)) {
+  if (paddle_mobile.Load(g_yolo, true)) {
     auto time2 = time();
-    DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
 
     std::vector<int64_t> dims{1, 3, 227, 227};
     Tensor input_tensor;
@@ -32,10 +33,15 @@ int main() {
 
     std::vector<float> input(input_tensor.data<float>(),
                              input_tensor.data<float>() + input_tensor.numel());
-    auto time3 = time();
+    // Warm-up run, issued before time3 so it is excluded from the timing.
     paddle_mobile.Predict(input, dims);
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      paddle_mobile.Predict(input, dims);
+    }
     auto time4 = time();
-    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
+              << std::endl;
   }
   return 0;
 }