Unverified commit 6cb30b4d, authored by xiebaiyuan, committed by GitHub

Merge pull request #822 from smilejames/develop

update unit test and jni
@@ -219,7 +219,7 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
env->DeleteLocalRef(ddims);
env->ReleaseFloatArrayElements(buf, dataPointer, 0);
env->DeleteLocalRef(buf);
env->DeleteLocalRef(dataPointer);
// env->DeleteLocalRef(dataPointer);
#endif
ANDROIDLOGI("predictImage finished");
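The line removed above called DeleteLocalRef on dataPointer, but dataPointer is the raw jfloat* returned by GetFloatArrayElements, not a JNI local reference; passing it to DeleteLocalRef is invalid, so commenting it out is the right fix. A minimal sketch of the intended lifetime pairing (function name and body are hypothetical, not the project's code):

    #include <jni.h>

    // Hypothetical reduction of the predictImage cleanup path.
    void process(JNIEnv *env, jfloatArray buf) {
      // Pins (or copies) the array contents and returns a raw jfloat*.
      jfloat *dataPointer = env->GetFloatArrayElements(buf, nullptr);
      // ... feed dataPointer to the predictor ...
      // A raw element pointer is released with its matching Release call:
      env->ReleaseFloatArrayElements(buf, dataPointer, 0);
      // buf itself IS a local reference, so deleting it is valid:
      env->DeleteLocalRef(buf);
      // env->DeleteLocalRef(dataPointer);  // invalid: not a jobject
    }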
...
@@ -402,7 +402,7 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
: "memory", "v0", "v1");
#else
asm volatile(
"pld [%[b0]] \n\t"
// "pld [%[b0]] \n\t"
"vld1.32 {q0, q1}, [%[b0]] \n\t"
"vst1.32 {q0, q1}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer)
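This hunk and the next both drop a pld that targeted the very cache line the following vld1.32 loads; a prefetch issued immediately before its own load has no time to hide any latency. For reference, the same 8-float copy can be written with NEON intrinsics, where __builtin_prefetch (GCC/Clang) would play the role of pld if a useful lookahead existed. This is a sketch under those assumptions, not the project's code:

    #include <arm_neon.h>

    // Copy one 8-float strip of B into the packing buffer.
    inline void pack_strip(const float *b0, float *&local_buffer) {
      // __builtin_prefetch(b0 + 16);  // worthwhile only for data needed
                                       // later, not the line loaded below
      float32x4_t q0 = vld1q_f32(b0);      // B[0..3]
      float32x4_t q1 = vld1q_f32(b0 + 4);  // B[4..7]
      vst1q_f32(local_buffer, q0);
      vst1q_f32(local_buffer + 4, q1);
      local_buffer += 8;  // advance like the "!" post-increment in the asm
    }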
@@ -454,7 +454,7 @@ void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
: "memory", "v0", "v1");
#else
asm volatile(
"pld [%[b0]] \n\t"
// "pld [%[b0]] \n\t"
"vld1.32 {q0, q1}, [%[b0]] \n\t"
"vst1.32 {q0, q1}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer)
@@ -2528,7 +2528,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 32 * 1024;
int L2 = 0.5 * 1024 * 1024;
int L2 = 512 * 1024;
KC = k;
MC = L1 / (KC * sizeof(float));
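0.5 * 1024 * 1024 is evaluated in double and truncated on assignment; it equals 512 * 1024 (524288), so the rewrite changes no value, it only keeps the computation in integer arithmetic. A quick check:

    #include <cassert>

    int main() {
      int l2_from_double = 0.5 * 1024 * 1024;  // double 524288.0, truncated to int
      int l2_from_int = 512 * 1024;            // pure integer arithmetic
      assert(l2_from_double == l2_from_int);   // both are 524288
      return 0;
    }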
@@ -2552,10 +2552,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
packedC = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
for (int l = 0; l < KC; ++l) {
zero[l] = 0;
}
memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
int mc, nc;
for (int j = 0; j < n; j += NC) {
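Replacing the element-wise loop with memset is safe here because IEEE-754 +0.0f is the all-zero bit pattern, so zero-filling the bytes and assigning 0.0f per element produce identical contents; the memset form is shorter and typically compiles to an optimized fill. The two equivalent forms:

    #include <cstring>

    void zero_fill(float *zero, int KC) {
      // Loop form removed by the diff:
      //   for (int l = 0; l < KC; ++l) zero[l] = 0;
      // memset form: valid for float because +0.0f is all-zero bytes.
      memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
    }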
@@ -2591,7 +2588,7 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 32 * 1024;
int L2 = 0.5 * 1024 * 1024;
int L2 = 512 * 1024;
KC = k;
MC = L1 / (KC * sizeof(float));
@@ -2615,10 +2612,7 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
packedC = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
for (int l = 0; l < KC; ++l) {
zero[l] = 0;
}
memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
int mc, nc;
for (int j = 0; j < n; j += NC) {
@@ -2658,7 +2652,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
int max_threads = 1;
#endif
int L1 = 32 * 1024;
int L1 = 64 / max_threads * 1024;
KC = k;
if (m > n) {
// tile matrix A
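In the OpenMP variants the fixed 32 KiB budget becomes 64 / max_threads * 1024: a 64 KiB budget split by integer division across threads (64 KiB for 1 thread, 32 KiB for 2, 21 KiB for 3, ...), so each thread's packed panel still fits in its core's cache. A sketch of how this feeds the block size, with names mirroring the diff but simplified:

    #ifdef _OPENMP
    #include <omp.h>
    #endif

    // Sketch: derive the A-panel height MC from a per-thread cache budget.
    int per_thread_MC(int k) {
    #ifdef _OPENMP
      int max_threads = omp_get_max_threads();
    #else
      int max_threads = 1;
    #endif
      int L1 = 64 / max_threads * 1024;  // integer division: per-thread share
      int KC = k;                        // panel depth, as in the diff
      // Rows of A whose KC-wide packed panel fits in the per-thread budget.
      return L1 / (KC * static_cast<int>(sizeof(float)));
    }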
@@ -2765,7 +2759,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
int max_threads = 1;
#endif
int L1 = 32 * 1024;
int L1 = 64 / max_threads * 1024;
KC = k;
if (m > n) {
// tile matrix A
@@ -2934,14 +2928,14 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr;
a_ptr = a;
b_ptr = b;
int kc1 = k / 4;
int kc2 = k % 4;
int kc1 = k / 8;
int kc2 = k % 8;
int step = 4 * ldc;
asm volatile(
"pld [%[a_ptr]] \n\t"
"pld [%[a_ptr], #64] \n\t"
"pld [%[b_ptr]] \n\t"
"pld [%[a_ptr], #64] \n\t"
"pld [%[b_ptr], #64] \n\t"
"pld [%[b_ptr], #64] \n\t"
"vmov.f32 q4, #0.0 \n\t"
"vmov.f32 q5, #0.0 \n\t"
@@ -2960,10 +2954,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
"blt 2f \n\t"
"1: \n\t"
// "pld [%[a_ptr], #128] \n\t"
// "pld [%[b_ptr], #128] \n\t"
// "pld [%[a_ptr], #192] \n\t"
// "pld [%[b_ptr], #192] \n\t"
"pld [%[a_ptr], #128] \n\t"
"pld [%[b_ptr], #128] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
@@ -2997,6 +2989,79 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"pld [%[a_ptr], #128] \n\t"
"pld [%[b_ptr], #128] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"pld [%[a_ptr], #128] \n\t"
"pld [%[b_ptr], #128] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"pld [%[a_ptr], #128] \n\t"
"pld [%[b_ptr], #128] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
...
@@ -35,7 +35,9 @@ int main() {
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
// warm up
auto vec_result = paddle_mobile.Predict(input, dims);
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
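Each of the test changes in this commit follows the same pattern: the single warm-up Predict becomes ten untimed iterations, so caches, thread pools, and any lazy initialization settle before the measured loop. The resulting shape, using the time() helper visible in the diff (time_diff is assumed to come from the same test helpers):

    // Warm-up: run untimed so first-call costs don't skew the measurement.
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }
    // Measured runs.
    auto time3 = time();
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }
    auto time4 = time();
    std::cout << "average Predict cost: " << time_diff(time3, time4) / 10
              << " ms" << std::endl;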
...
@@ -33,7 +33,9 @@ int main() {
GetInput<float>(g_hand, &input, dims);
// warm up
auto output = paddle_mobile.Predict(input, dims);
for (int i = 0; i < 10; ++i) {
auto output = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto output = paddle_mobile.Predict(input, dims);
...
@@ -39,6 +39,9 @@ int main() {
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
...
@@ -33,7 +33,9 @@ int main() {
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
// warm up
paddle_mobile.Predict(input, dims);
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
...
@@ -34,7 +34,9 @@ int main() {
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
// warm up
paddle_mobile.Predict(input, dims);
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
...