diff --git a/README.md b/README.md
index 05a109a81791ac85d975138dcd76f7f71716624a..de8fe0bb7f4613bd9d6dfebd82db1d407ee682f4 100644
--- a/README.md
+++ b/README.md
@@ -28,19 +28,19 @@ Paddle-Moible是PaddlePaddle组织下的项目，是一个致力于嵌入式平
 |mobilenet arm v7|1线程|2线程|4线程|
 |------------|----|-----|-----|
-|麒麟960(ms)|110.586|70.897|47.474|
+|麒麟960(ms)|110.586|63.285|38.215|
 |||||
 |mobilenetssd arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|222.124|138.952|90.856|
+|麒麟960(ms)|220.248|128.473|79.334|
 |||||
 |googlenet(v1) arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|348.018|240.304|169.998|
+|麒麟960(ms)|341.965|228.724|161.531|
 |||||
 |squeezenet arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|84.685|56.544|38.833|
+|麒麟960(ms)|84.080|55.641|37.182|
 |||||
 |yolo arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|131.831|88.990|60.905|
+|麒麟960(ms)|129.445|80.627|50.936|
 
 arm cpu是paddle-mobile的主要支持方向，cpu的通用性一直是其优势。嵌入式深度学习，需要大量的cpu汇编实现。我们正在紧锣密鼓的编码，为的是能充分硬件的每一点加速能力。
 
 arm cpu的优化工作还在进行中，现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是110+ms，显然这不是我们的最终目标，我们正在用大量的汇编改写，后续性能仍会有巨大提升空间, 目前只支持armv7, 未来我们也会支持armv8。
diff --git a/src/io/executor.cpp b/src/io/executor.cpp
index 73e6c9d6f170fc4eebb6af2f8b7a67c847961950..91005287055b7af859d738ea20c40abbf5f7db96 100644
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -192,8 +192,14 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
     }
     *data += (memory_size * sizeof(uint8_t));
   } else {
-    for (int n = 0; n < memory_size * type_size; ++n) {
-      static_cast<char *>(memory)[n] = (*data)[n];
+    for (int n = 0; n < memory_size; n++) {
+      float value;
+      memcpy(&value, *data + n * type_size, type_size);
+      if (value < 1e-30 && value > -1e-30) {
+        static_cast<float *>(memory)[n] = 0.0;
+      } else {
+        static_cast<float *>(memory)[n] = value;
+      }
     }
     (*data) += (sizeof(char) * memory_size * type_size);
   }
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index 3730cf350a1399e5f3c1473fd1ce8d7b1d13b1b6..0fb454c89d66dabdcdd40c6590120016182c6629 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -2957,8 +2957,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
       "vmov.f32 q15, #0.0 \n\t"
 
       "subs %[kc1], %[kc1], #1 \n\t"
-      "blt end_kc1_%= \n\t"
-      "loop_kc1_%=: \n\t"
+      "blt 2f \n\t"
+      "1: \n\t"
 
       // "pld [%[a_ptr], #128] \n\t"
       // "pld [%[b_ptr], #128] \n\t"
@@ -3030,12 +3030,12 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
       "vmla.f32 q15, q3, d2[1] \n\t"
 
       "subs %[kc1], %[kc1], #1 \n\t"
-      "bge loop_kc1_%= \n\t"
-      "end_kc1_%=: \n\t"
+      "bge 1b \n\t"
+      "2: \n\t"
 
       "subs %[kc2], %[kc2], #1 \n\t"
-      "blt end_kc2_%= \n\t"
-      "loop_kc2_%=: \n\t"
+      "blt 4f \n\t"
+      "3: \n\t"
 
       "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
       "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
@@ -3054,8 +3054,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
      "vmla.f32 q15, q3, d2[1] \n\t"
 
      "subs %[kc2], %[kc2], #1 \n\t"
-     "bge loop_kc2_%= \n\t"
-     "end_kc2_%=: \n\t"
+     "bge 3b \n\t"
+     "4: \n\t"
 
      "mov r5, %[c] \n\t"
      "mov r6, %[step] \n\t"
@@ -3113,8 +3113,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
      "dup v28.4s, wzr \n\t"
 
      "subs %[kc1], %[kc1], #1 \n\t"
-     "blt end_kc1_%= \n\t"
-     "loop_kc1_%=: \n\t"
+     "blt 2f \n\t"
+     "1: \n\t"
 
      "prfm pldl1keep, [%[a_ptr], #32] \n\t"
      "prfm pldl1keep, [%[b_ptr], #48] \n\t"
@@ -3149,8 +3149,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
      "fmla v28.4s, v4.4s, v1.s[3] \n\t"
 
      "subs %[kc1], %[kc1], #1 \n\t"
-     "bge loop_kc1_%= \n\t"
-     "end_kc1_%=: \n\t"
+     "bge 1b \n\t"
+     "2: \n\t"
 
      "st1 {v5.4s, v6.4s, v7.4s}, [%[c]], %[step] \n\t"
      "st1 {v8.4s, v9.4s, v10.4s}, [%[c]], %[step] \n\t"
@@ -3205,8 +3205,8 @@ void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) {
      "dup v29.4s, wzr \n\t"
 
      "subs %[kc1], %[kc1], #1 \n\t"
-     "blt end_kc1_%= \n\t"
-     "loop_kc1_%=: \n\t"
+     "blt 2f \n\t"
+     "1: \n\t"
 
      "prfm pldl1keep, [%[a_ptr], #24] \n\t"
      "prfm pldl1keep, [%[b_ptr], #64] \n\t"
@@ -3245,8 +3245,8 @@ void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) {
      "fmla v29.4s, v5.4s, v1.s[1] \n\t"
 
      "subs %[kc1], %[kc1], #1 \n\t"
-     "bge loop_kc1_%= \n\t"
-     "end_kc1_%=: \n\t"
+     "bge 1b \n\t"
+     "2: \n\t"
 
      "st1 {v6.4s, v7.4s, v8.4s, v9.4s}, [%[c]], %[step] \n\t"
      "st1 {v10.4s, v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t"
diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp
index 4c14f63bde40675a7e0016e28d900788431ff2ae..5d89618859d47fd7d61d61871583e1ebbca3db33 100644
--- a/test/net/test_squeezenet.cpp
+++ b/test/net/test_squeezenet.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(2);
+  paddle_mobile.SetThreadNum(4);
   // ../../../test/models/googlenet
   // ../../../test/models/mobilenet
   auto time1 = time();
diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp
index 83508cff335c55f5cc416c6652d83706a4626c1a..ffe3cdc22c4f847da2503192660a99f7f6d62e37 100644
--- a/test/net/test_yolo.cpp
+++ b/test/net/test_yolo.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(2);
+  paddle_mobile.SetThreadNum(4);
   // ../../../test/models/googlenet
   // ../../../test/models/mobilenet
   auto time1 = time();
diff --git a/test/operators/test_fusion_conv_add_bn_relu_op.cpp b/test/operators/test_fusion_conv_add_bn_relu_op.cpp
index 81400d987195364c06b4b93d0859469b43f90e7b..7764d95ed72da613459233bd55ddcffdc444318f 100644
--- a/test/operators/test_fusion_conv_add_bn_relu_op.cpp
+++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include
 #include "../test_include.h"
 #include "operators/fusion_conv_add_bn_relu_op.h"
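
Note on the `src/io/executor.cpp` hunk: `LoadMemory` now reads each stored weight through `memcpy` and flushes any value with magnitude below 1e-30 to exact zero before writing it into the tensor, so the ARM kernels never operate on subnormal (denormal) floats, which many CPUs handle far more slowly than normal values. Below is a minimal standalone sketch of the same flush-to-zero idea; the helper name `CopyWeightsFlushTiny` and the `main()` driver are hypothetical, not paddle-mobile API.

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Copy raw little-endian float bytes into a destination buffer, zeroing any
// value with |v| < 1e-30 (the same threshold the patch uses) so that later
// arithmetic never touches denormals. Hypothetical helper for illustration.
void CopyWeightsFlushTiny(const uint8_t *src, float *dst, int count) {
  const int type_size = sizeof(float);
  for (int n = 0; n < count; ++n) {
    float value;
    std::memcpy(&value, src + n * type_size, type_size);
    dst[n] = (value < 1e-30f && value > -1e-30f) ? 0.0f : value;
  }
}

int main() {
  float raw[2] = {1e-40f, 0.5f};  // 1e-40f is a subnormal float
  std::vector<uint8_t> bytes(sizeof(raw));
  std::memcpy(bytes.data(), raw, sizeof(raw));

  float out[2];
  CopyWeightsFlushTiny(bytes.data(), out, 2);
  std::cout << out[0] << " " << out[1] << std::endl;  // prints "0 0.5"
  return 0;
}
```

Like the patched loop, the sketch assumes the stored element type is a 4-byte float, which is why copying `type_size` bytes into a `float` is safe here.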
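Note on the `src/operators/math/gemm.cpp` hunks: the inline-assembly loops in `AddDot6x8`, `AddDot8x12` and `AddDot6x16` switch from named labels such as `loop_kc1_%=` / `end_kc1_%=` to GNU assembler numeric local labels (`1:`, `2:`, referenced as `1b` / `2f`). Numeric local labels may be defined any number of times, so the block still assembles even if the compiler emits it more than once (after inlining, cloning, or unrolling), whereas a repeated named label fails with a "symbol already defined" error; that is presumably the motivation here. The sketch below reproduces the same `blt 2f` / `1:` / `bge 1b` / `2:` skeleton in a self-contained function; it targets AArch64 with a portable C++ fallback and is illustrative only, not code from the patch.

```cpp
#include <cstdio>

// Count from 0 up to n using the same local-label skeleton as the patched
// kernels: skip the loop when the trip count is exhausted (blt 2f), loop
// back with bge 1b, and fall through to the numeric end label 2:.
long CountIterations(long n) {
#if defined(__aarch64__)
  long iters = 0;
  asm volatile(
      "subs %[n], %[n], #1         \n\t"  // n -= 1, set condition flags
      "blt  2f                     \n\t"  // nothing to do if n was 0
      "1:                          \n\t"  // loop body (numeric local label)
      "add  %[iters], %[iters], #1 \n\t"
      "subs %[n], %[n], #1         \n\t"
      "bge  1b                     \n\t"  // branch back to the nearest "1:"
      "2:                          \n\t"  // loop exit
      : [n] "+r"(n), [iters] "+r"(iters)
      :
      : "cc");
  return iters;
#else
  long iters = 0;  // portable fallback for non-AArch64 builds
  for (long i = 0; i < n; ++i) ++iters;
  return iters;
#endif
}

int main() {
  std::printf("%ld\n", CountIterations(5));  // prints 5
  return 0;
}
```

`1b` means "the nearest `1:` looking backwards" and `2f` "the nearest `2:` looking forwards", so the entry check can jump over the loop body without ever naming a unique symbol.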