diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp
index 1b909532e96d4337d620fb0b7cf562ee35a3dc72..0da56305f978dc874666a2be26c15a9de47b3757 100644
--- a/src/jni/paddle_mobile_jni.cpp
+++ b/src/jni/paddle_mobile_jni.cpp
@@ -219,7 +219,7 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
   env->DeleteLocalRef(ddims);
   env->ReleaseFloatArrayElements(buf, dataPointer, 0);
   env->DeleteLocalRef(buf);
-  env->DeleteLocalRef(dataPointer);
+  // env->DeleteLocalRef(dataPointer);
 #endif

   ANDROIDLOGI("predictImage finished");
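In the hunk above, dataPointer is the jfloat * returned by GetFloatArrayElements, not a JNI local reference, so passing it to DeleteLocalRef was invalid; the ReleaseFloatArrayElements call two lines earlier is already the matching cleanup. A minimal sketch of the intended pairing, reusing the names from the hunk (the rest of the function body is assumed, not shown in the diff):

    jfloat *dataPointer = env->GetFloatArrayElements(buf, nullptr);
    // ... read the input data out of dataPointer ...
    // Matching cleanup for Get*ArrayElements; mode 0 copies back and frees.
    env->ReleaseFloatArrayElements(buf, dataPointer, 0);
    // DeleteLocalRef is only for jobject handles such as the array itself.
    env->DeleteLocalRef(buf);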
#64] \n\t" "pld [%[b_ptr]] \n\t" - "pld [%[a_ptr], #64] \n\t" - "pld [%[b_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" "vmov.f32 q4, #0.0 \n\t" "vmov.f32 q5, #0.0 \n\t" @@ -2960,10 +2954,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { "blt 2f \n\t" "1: \n\t" - // "pld [%[a_ptr], #128] \n\t" - // "pld [%[b_ptr], #128] \n\t" - // "pld [%[a_ptr], #192] \n\t" - // "pld [%[b_ptr], #192] \n\t" + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" @@ -2997,6 +2989,79 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" "vld1.32 {q2, q3}, [%[b_ptr]]! 
\n\t" diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index b98d07ad16dcb15268e4638f9144bde36a1005a8..96d9479eab07a4beb1631949e822529640e99d87 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -35,7 +35,9 @@ int main() { std::vector dims{1, 3, 224, 224}; GetInput(g_test_image_1x3x224x224, &input, dims); // 预热一次 - auto vec_result = paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp index ae6c40961ca96ea032b1822f17a663baedc8f661..0bb6fea0e7bb5bd8d3154bd1a6ea21e42582aaa4 100644 --- a/test/net/test_mobilenet+ssd.cpp +++ b/test/net/test_mobilenet+ssd.cpp @@ -33,7 +33,9 @@ int main() { GetInput(g_hand, &input, dims); // 预热一次 - auto output = paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + auto output = paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { auto output = paddle_mobile.Predict(input, dims); diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 56234c3c72b58869775238d78875c8bd3b94cf7c..fe7b431caf07ae260e60dbe2fdc8765eecd43f2f 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -39,6 +39,9 @@ int main() { std::cout << " Max element is " << *biggest << " at position " << std::distance(std::begin(vec_result), biggest) << std::endl; + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp index 5d89618859d47fd7d61d61871583e1ebbca3db33..3ed0299d9df69bb9c77e078f2f42ee90f0667b1e 100644 --- a/test/net/test_squeezenet.cpp +++ b/test/net/test_squeezenet.cpp @@ -33,7 +33,9 @@ int main() { std::vector input(input_tensor.data(), input_tensor.data() + input_tensor.numel()); // 预热一次 - paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { paddle_mobile.Predict(input, dims); diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp index ffe3cdc22c4f847da2503192660a99f7f6d62e37..bb566d8b8e4907f8b979222d36f421914cca50bf 100644 --- a/test/net/test_yolo.cpp +++ b/test/net/test_yolo.cpp @@ -34,7 +34,9 @@ int main() { std::vector input(input_tensor.data(), input_tensor.data() + input_tensor.numel()); // 预热一次 - paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { paddle_mobile.Predict(input, dims);