From dcd03dd57ff42ff7a90effebf158528ad4d422e3 Mon Sep 17 00:00:00 2001 From: zhaojiaying01 Date: Thu, 23 Aug 2018 11:09:47 +0800 Subject: [PATCH] update unit test and jni --- src/jni/paddle_mobile_jni.cpp | 2 +- src/operators/math/gemm.cpp | 109 +++++++++++++++++++++++++------- test/net/test_googlenet.cpp | 4 +- test/net/test_mobilenet+ssd.cpp | 4 +- test/net/test_mobilenet.cpp | 3 + test/net/test_squeezenet.cpp | 4 +- test/net/test_yolo.cpp | 4 +- 7 files changed, 103 insertions(+), 27 deletions(-) diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp index 1b909532e9..0da56305f9 100644 --- a/src/jni/paddle_mobile_jni.cpp +++ b/src/jni/paddle_mobile_jni.cpp @@ -219,7 +219,7 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage( env->DeleteLocalRef(ddims); env->ReleaseFloatArrayElements(buf, dataPointer, 0); env->DeleteLocalRef(buf); - env->DeleteLocalRef(dataPointer); +// env->DeleteLocalRef(dataPointer); #endif ANDROIDLOGI("predictImage finished"); diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index 0fb454c89d..de0a4f6294 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -402,7 +402,7 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, : "memory", "v0", "v1"); #else asm volatile( - "pld [%[b0]] \n\t" + // "pld [%[b0]] \n\t" "vld1.32 {q0, q1}, [%[b0]] \n\t" "vst1.32 {q0, q1}, [%[local_buffer]]! \n\t" : [local_buffer] "+r"(local_buffer) @@ -454,7 +454,7 @@ void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, : "memory", "v0", "v1"); #else asm volatile( - "pld [%[b0]] \n\t" + // "pld [%[b0]] \n\t" "vld1.32 {q0, q1}, [%[b0]] \n\t" "vst1.32 {q0, q1}, [%[local_buffer]]! \n\t" : [local_buffer] "+r"(local_buffer) @@ -2528,7 +2528,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L2 cache is 0.5~4 Mib (Contex-A72 cluster) int L1 = 32 * 1024; - int L2 = 0.5 * 1024 * 1024; + int L2 = 512 * 1024; KC = k; MC = L1 / (KC * sizeof(float)); @@ -2552,10 +2552,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, packedC = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - - for (int l = 0; l < KC; ++l) { - zero[l] = 0; - } + memset(static_cast(zero), 0, sizeof(float) * KC); int mc, nc; for (int j = 0; j < n; j += NC) { @@ -2591,7 +2588,7 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L2 cache is 0.5~4 Mib (Contex-A72 cluster) int L1 = 32 * 1024; - int L2 = 0.5 * 1024 * 1024; + int L2 = 512 * 1024; KC = k; MC = L1 / (KC * sizeof(float)); @@ -2615,10 +2612,7 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, packedC = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - - for (int l = 0; l < KC; ++l) { - zero[l] = 0; - } + memset(static_cast(zero), 0, sizeof(float) * KC); int mc, nc; for (int j = 0; j < n; j += NC) { @@ -2658,7 +2652,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, int max_threads = 1; #endif - int L1 = 32 * 1024; + int L1 = 64 / max_threads * 1024; KC = k; if (m > n) { // 对 A 分块 @@ -2765,7 +2759,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, int max_threads = 1; #endif - int L1 = 32 * 1024; + int L1 = 64 / max_threads * 1024; KC = k; if (m > n) { // 对 A 分块 @@ -2934,14 +2928,14 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { const float *a_ptr, *b_ptr; a_ptr = a; b_ptr = b; - int kc1 = k / 4; - int kc2 = k % 4; + int kc1 = k / 8; + int kc2 = k % 8; int step = 4 * ldc; asm volatile( "pld [%[a_ptr]] \n\t" + "pld [%[a_ptr], #64] \n\t" "pld [%[b_ptr]] \n\t" - "pld [%[a_ptr], #64] \n\t" - "pld [%[b_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" "vmov.f32 q4, #0.0 \n\t" "vmov.f32 q5, #0.0 \n\t" @@ -2960,10 +2954,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { "blt 2f \n\t" "1: \n\t" - // "pld [%[a_ptr], #128] \n\t" - // "pld [%[b_ptr], #128] \n\t" - // "pld [%[a_ptr], #192] \n\t" - // "pld [%[b_ptr], #192] \n\t" + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" @@ -2997,6 +2989,79 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index b98d07ad16..96d9479eab 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -35,7 +35,9 @@ int main() { std::vector dims{1, 3, 224, 224}; GetInput(g_test_image_1x3x224x224, &input, dims); // 预热一次 - auto vec_result = paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp index ae6c40961c..c7ba847f7e 100644 --- a/test/net/test_mobilenet+ssd.cpp +++ b/test/net/test_mobilenet+ssd.cpp @@ -33,7 +33,9 @@ int main() { GetInput(g_hand, &input, dims); // 预热一次 - auto output = paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + auto output = paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { auto output = paddle_mobile.Predict(input, dims); diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 56234c3c72..3fd9ee320b 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -39,6 +39,9 @@ int main() { std::cout << " Max element is " << *biggest << " at position " << std::distance(std::begin(vec_result), biggest) << std::endl; + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp index 5d89618859..7a5022b356 100644 --- a/test/net/test_squeezenet.cpp +++ b/test/net/test_squeezenet.cpp @@ -33,7 +33,9 @@ int main() { std::vector input(input_tensor.data(), input_tensor.data() + input_tensor.numel()); // 预热一次 - paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { paddle_mobile.Predict(input, dims); diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp index ffe3cdc22c..15bb5ea321 100644 --- a/test/net/test_yolo.cpp +++ b/test/net/test_yolo.cpp @@ -34,7 +34,9 @@ int main() { std::vector input(input_tensor.data(), input_tensor.data() + input_tensor.numel()); // 预热一次 - paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { paddle_mobile.Predict(input, dims); -- GitLab