From dcd03dd57ff42ff7a90effebf158528ad4d422e3 Mon Sep 17 00:00:00 2001
From: zhaojiaying01 <zhaojiaying01@baidu.com>
Date: Thu, 23 Aug 2018 11:09:47 +0800
Subject: [PATCH] update unit test, jni and gemm

---
 src/jni/paddle_mobile_jni.cpp   |   2 +-
 src/operators/math/gemm.cpp     | 109 +++++++++++++++++++++++++-------
 test/net/test_googlenet.cpp     |   4 +-
 test/net/test_mobilenet+ssd.cpp |   4 +-
 test/net/test_mobilenet.cpp     |   3 +
 test/net/test_squeezenet.cpp    |   4 +-
 test/net/test_yolo.cpp          |   4 +-
 7 files changed, 103 insertions(+), 27 deletions(-)
diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp
index 1b909532e9..0da56305f9 100644
--- a/src/jni/paddle_mobile_jni.cpp
+++ b/src/jni/paddle_mobile_jni.cpp
@@ -219,7 +219,7 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
   env->DeleteLocalRef(ddims);
   env->ReleaseFloatArrayElements(buf, dataPointer, 0);
   env->DeleteLocalRef(buf);
-  env->DeleteLocalRef(dataPointer);
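+  // Do not call env->DeleteLocalRef(dataPointer): it is the element pointer released by ReleaseFloatArrayElements above, not a JNI local reference.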
 #endif
 
   ANDROIDLOGI("predictImage finished");
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index 0fb454c89d..de0a4f6294 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -402,7 +402,7 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
           : "memory", "v0", "v1");
 #else
       asm volatile(
-          "pld        [%[b0]]                     \n\t"
+          //          "pld        [%[b0]]                     \n\t"
           "vld1.32    {q0, q1},   [%[b0]]         \n\t"
           "vst1.32    {q0, q1},   [%[local_buffer]]!    \n\t"
           : [local_buffer] "+r"(local_buffer)
@@ -454,7 +454,7 @@ void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
           : "memory", "v0", "v1");
 #else
       asm volatile(
-          "pld        [%[b0]]                     \n\t"
+          //          "pld        [%[b0]]                     \n\t"
           "vld1.32    {q0, q1},   [%[b0]]         \n\t"
           "vst1.32    {q0, q1},   [%[local_buffer]]!    \n\t"
           : [local_buffer] "+r"(local_buffer)
@@ -2528,7 +2528,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
   // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
   // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
   int L1 = 32 * 1024;
-  int L2 = 0.5 * 1024 * 1024;
+  int L2 = 512 * 1024;
 
   KC = k;
   MC = L1 / (KC * sizeof(float));
@@ -2552,10 +2552,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
   packedC = static_cast<float *>(
       paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
   zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-
-  for (int l = 0; l < KC; ++l) {
-    zero[l] = 0;
-  }
+  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
 
   int mc, nc;
   for (int j = 0; j < n; j += NC) {
@@ -2591,7 +2588,7 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
   // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
   // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
   int L1 = 32 * 1024;
-  int L2 = 0.5 * 1024 * 1024;
+  int L2 = 512 * 1024;
 
   KC = k;
   MC = L1 / (KC * sizeof(float));
@@ -2615,10 +2612,7 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
   packedC = static_cast<float *>(
       paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
   zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-
-  for (int l = 0; l < KC; ++l) {
-    zero[l] = 0;
-  }
+  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
 
   int mc, nc;
   for (int j = 0; j < n; j += NC) {
@@ -2658,7 +2652,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
   int max_threads = 1;
 #endif
 
-  int L1 = 32 * 1024;
+  int L1 = 64 / max_threads * 1024;
   KC = k;
   if (m > n) {
     // 对 A 分块
@@ -2765,7 +2759,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
   int max_threads = 1;
 #endif
 
-  int L1 = 32 * 1024;
+  int L1 = 64 / max_threads * 1024;
   KC = k;
   if (m > n) {
     // 对 A 分块
@@ -2934,14 +2928,14 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
   const float *a_ptr, *b_ptr;
   a_ptr = a;
   b_ptr = b;
-  int kc1 = k / 4;
-  int kc2 = k % 4;
+  int kc1 = k / 8;
+  int kc2 = k % 8;
   int step = 4 * ldc;
   asm volatile(
       "pld        [%[a_ptr]]            \n\t"
+      "pld        [%[a_ptr],  #64]      \n\t"
       "pld        [%[b_ptr]]            \n\t"
-      "pld        [%[a_ptr],  #64]            \n\t"
-      "pld        [%[b_ptr],  #64]            \n\t"
+      "pld        [%[b_ptr],  #64]      \n\t"
 
       "vmov.f32   q4,     #0.0          \n\t"
       "vmov.f32   q5,     #0.0          \n\t"
@@ -2960,10 +2954,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
       "blt        2f                    \n\t"
       "1:                               \n\t"
 
-      //      "pld        [%[a_ptr], #128]       \n\t"
-      //      "pld        [%[b_ptr], #128]       \n\t"
-      //      "pld        [%[a_ptr], #192]       \n\t"
-      //      "pld        [%[b_ptr], #192]       \n\t"
+      "pld        [%[a_ptr], #128]       \n\t"
+      "pld        [%[b_ptr], #128]       \n\t"
 
       "vld1.32    {d0-d2},  [%[a_ptr]]!   \n\t"
       "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
@@ -2997,6 +2989,79 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
       "vmla.f32   q14,  q2,   d2[1]       \n\t"
       "vmla.f32   q15,  q3,   d2[1]       \n\t"
 
+      "pld        [%[a_ptr], #128]       \n\t"
+      "pld        [%[b_ptr], #128]       \n\t"
+
+      "vld1.32    {d0-d2},  [%[a_ptr]]!   \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
+
+      "vmla.f32   q4,   q2,   d0[0]       \n\t"
+      "vmla.f32   q5,   q3,   d0[0]       \n\t"
+      "vmla.f32   q6,   q2,   d0[1]       \n\t"
+      "vmla.f32   q7,   q3,   d0[1]       \n\t"
+      "vmla.f32   q8,   q2,   d1[0]       \n\t"
+      "vmla.f32   q9,   q3,   d1[0]       \n\t"
+      "vmla.f32   q10,  q2,   d1[1]       \n\t"
+      "vmla.f32   q11,  q3,   d1[1]       \n\t"
+      "vmla.f32   q12,  q2,   d2[0]       \n\t"
+      "vmla.f32   q13,  q3,   d2[0]       \n\t"
+      "vmla.f32   q14,  q2,   d2[1]       \n\t"
+      "vmla.f32   q15,  q3,   d2[1]       \n\t"
+
+      "vld1.32    {d0-d2},  [%[a_ptr]]!   \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
+
+      "vmla.f32   q4,   q2,   d0[0]       \n\t"
+      "vmla.f32   q5,   q3,   d0[0]       \n\t"
+      "vmla.f32   q6,   q2,   d0[1]       \n\t"
+      "vmla.f32   q7,   q3,   d0[1]       \n\t"
+      "vmla.f32   q8,   q2,   d1[0]       \n\t"
+      "vmla.f32   q9,   q3,   d1[0]       \n\t"
+      "vmla.f32   q10,  q2,   d1[1]       \n\t"
+      "vmla.f32   q11,  q3,   d1[1]       \n\t"
+      "vmla.f32   q12,  q2,   d2[0]       \n\t"
+      "vmla.f32   q13,  q3,   d2[0]       \n\t"
+      "vmla.f32   q14,  q2,   d2[1]       \n\t"
+      "vmla.f32   q15,  q3,   d2[1]       \n\t"
+
+      "pld        [%[a_ptr], #128]       \n\t"
+      "pld        [%[b_ptr], #128]       \n\t"
+
+      "vld1.32    {d0-d2},  [%[a_ptr]]!   \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
+
+      "vmla.f32   q4,   q2,   d0[0]       \n\t"
+      "vmla.f32   q5,   q3,   d0[0]       \n\t"
+      "vmla.f32   q6,   q2,   d0[1]       \n\t"
+      "vmla.f32   q7,   q3,   d0[1]       \n\t"
+      "vmla.f32   q8,   q2,   d1[0]       \n\t"
+      "vmla.f32   q9,   q3,   d1[0]       \n\t"
+      "vmla.f32   q10,  q2,   d1[1]       \n\t"
+      "vmla.f32   q11,  q3,   d1[1]       \n\t"
+      "vmla.f32   q12,  q2,   d2[0]       \n\t"
+      "vmla.f32   q13,  q3,   d2[0]       \n\t"
+      "vmla.f32   q14,  q2,   d2[1]       \n\t"
+      "vmla.f32   q15,  q3,   d2[1]       \n\t"
+
+      "vld1.32    {d0-d2},  [%[a_ptr]]!   \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
+
+      "vmla.f32   q4,   q2,   d0[0]       \n\t"
+      "vmla.f32   q5,   q3,   d0[0]       \n\t"
+      "vmla.f32   q6,   q2,   d0[1]       \n\t"
+      "vmla.f32   q7,   q3,   d0[1]       \n\t"
+      "vmla.f32   q8,   q2,   d1[0]       \n\t"
+      "vmla.f32   q9,   q3,   d1[0]       \n\t"
+      "vmla.f32   q10,  q2,   d1[1]       \n\t"
+      "vmla.f32   q11,  q3,   d1[1]       \n\t"
+      "vmla.f32   q12,  q2,   d2[0]       \n\t"
+      "vmla.f32   q13,  q3,   d2[0]       \n\t"
+      "vmla.f32   q14,  q2,   d2[1]       \n\t"
+      "vmla.f32   q15,  q3,   d2[1]       \n\t"
+
+      "pld        [%[a_ptr], #128]       \n\t"
+      "pld        [%[b_ptr], #128]       \n\t"
+
       "vld1.32    {d0-d2},  [%[a_ptr]]!   \n\t"
       "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
 
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index b98d07ad16..96d9479eab 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -35,7 +35,9 @@ int main() {
     std::vector<int64_t> dims{1, 3, 224, 224};
     GetInput<float>(g_test_image_1x3x224x224, &input, dims);
     // 预热一次
-    auto vec_result = paddle_mobile.Predict(input, dims);
+    for (int i = 0; i < 10; ++i) {
+      auto vec_result = paddle_mobile.Predict(input, dims);
+    }
     auto time3 = time();
     for (int i = 0; i < 10; ++i) {
       auto vec_result = paddle_mobile.Predict(input, dims);
diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp
index ae6c40961c..c7ba847f7e 100644
--- a/test/net/test_mobilenet+ssd.cpp
+++ b/test/net/test_mobilenet+ssd.cpp
@@ -33,7 +33,9 @@ int main() {
     GetInput<float>(g_hand, &input, dims);
 
     // 预热一次
-    auto output = paddle_mobile.Predict(input, dims);
+	  for (int i = 0; i < 10; ++i) {
+		  auto output = paddle_mobile.Predict(input, dims);
+	  }
     auto time3 = time();
     for (int i = 0; i < 10; ++i) {
       auto output = paddle_mobile.Predict(input, dims);
diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp
index 56234c3c72..3fd9ee320b 100644
--- a/test/net/test_mobilenet.cpp
+++ b/test/net/test_mobilenet.cpp
@@ -39,6 +39,9 @@ int main() {
     std::cout << " Max element is " << *biggest << " at position "
               << std::distance(std::begin(vec_result), biggest) << std::endl;
 
+	  for (int i = 0; i < 10; ++i) {
+		  auto vec_result = paddle_mobile.Predict(input, dims);
+	  }
     auto time3 = time();
     for (int i = 0; i < 10; ++i) {
       auto vec_result = paddle_mobile.Predict(input, dims);
diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp
index 5d89618859..7a5022b356 100644
--- a/test/net/test_squeezenet.cpp
+++ b/test/net/test_squeezenet.cpp
@@ -33,7 +33,9 @@ int main() {
     std::vector<float> input(input_tensor.data<float>(),
                              input_tensor.data<float>() + input_tensor.numel());
     // 预热一次
-    paddle_mobile.Predict(input, dims);
+	  for (int i = 0; i < 10; ++i) {
+		  paddle_mobile.Predict(input, dims);
+	  }
     auto time3 = time();
     for (int i = 0; i < 10; ++i) {
       paddle_mobile.Predict(input, dims);
diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp
index ffe3cdc22c..15bb5ea321 100644
--- a/test/net/test_yolo.cpp
+++ b/test/net/test_yolo.cpp
@@ -34,7 +34,9 @@ int main() {
     std::vector<float> input(input_tensor.data<float>(),
                              input_tensor.data<float>() + input_tensor.numel());
     // 预热一次
-    paddle_mobile.Predict(input, dims);
+	  for (int i = 0; i < 10; ++i) {
+		  paddle_mobile.Predict(input, dims);
+	  }
     auto time3 = time();
     for (int i = 0; i < 10; ++i) {
       paddle_mobile.Predict(input, dims);
-- 
GitLab