diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp
index 1b909532e96d4337d620fb0b7cf562ee35a3dc72..0da56305f978dc874666a2be26c15a9de47b3757 100644
--- a/src/jni/paddle_mobile_jni.cpp
+++ b/src/jni/paddle_mobile_jni.cpp
@@ -219,7 +219,7 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
   env->DeleteLocalRef(ddims);
   env->ReleaseFloatArrayElements(buf, dataPointer, 0);
   env->DeleteLocalRef(buf);
-  env->DeleteLocalRef(dataPointer);
+  // env->DeleteLocalRef(dataPointer);
 #endif

   ANDROIDLOGI("predictImage finished");
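In the hunk above, dataPointer is the jfloat * returned by GetFloatArrayElements, not a JNI local reference, so passing it to DeleteLocalRef was invalid; the ReleaseFloatArrayElements call two lines earlier is already the matching cleanup. A minimal sketch of the intended pairing, reusing the names from the hunk (the rest of the function body is assumed, not shown in the diff):

    jfloat *dataPointer = env->GetFloatArrayElements(buf, nullptr);
    // ... read the input data out of dataPointer ...
    // Matching cleanup for Get*ArrayElements; mode 0 copies back and frees.
    env->ReleaseFloatArrayElements(buf, dataPointer, 0);
    // DeleteLocalRef is only for jobject handles such as the array itself.
    env->DeleteLocalRef(buf);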
#64] \n\t" "pld [%[b_ptr]] \n\t" - "pld [%[a_ptr], #64] \n\t" - "pld [%[b_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" "vmov.f32 q4, #0.0 \n\t" "vmov.f32 q5, #0.0 \n\t" @@ -2960,10 +2954,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { "blt 2f \n\t" "1: \n\t" - // "pld [%[a_ptr], #128] \n\t" - // "pld [%[b_ptr], #128] \n\t" - // "pld [%[a_ptr], #192] \n\t" - // "pld [%[b_ptr], #192] \n\t" + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" @@ -2997,6 +2989,79 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" "vld1.32 {q2, q3}, [%[b_ptr]]! 
\n\t" diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index b98d07ad16dcb15268e4638f9144bde36a1005a8..96d9479eab07a4beb1631949e822529640e99d87 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -35,7 +35,9 @@ int main() { std::vector dims{1, 3, 224, 224}; GetInput(g_test_image_1x3x224x224, &input, dims); // 预热一次 - auto vec_result = paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp index ae6c40961ca96ea032b1822f17a663baedc8f661..0bb6fea0e7bb5bd8d3154bd1a6ea21e42582aaa4 100644 --- a/test/net/test_mobilenet+ssd.cpp +++ b/test/net/test_mobilenet+ssd.cpp @@ -33,7 +33,9 @@ int main() { GetInput(g_hand, &input, dims); // 预热一次 - auto output = paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + auto output = paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { auto output = paddle_mobile.Predict(input, dims); diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 56234c3c72b58869775238d78875c8bd3b94cf7c..fe7b431caf07ae260e60dbe2fdc8765eecd43f2f 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -39,6 +39,9 @@ int main() { std::cout << " Max element is " << *biggest << " at position " << std::distance(std::begin(vec_result), biggest) << std::endl; + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp index 5d89618859d47fd7d61d61871583e1ebbca3db33..3ed0299d9df69bb9c77e078f2f42ee90f0667b1e 100644 --- a/test/net/test_squeezenet.cpp +++ b/test/net/test_squeezenet.cpp @@ -33,7 +33,9 @@ int main() { std::vector input(input_tensor.data(), input_tensor.data() + input_tensor.numel()); // 预热一次 - paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { paddle_mobile.Predict(input, dims); diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp index ffe3cdc22c4f847da2503192660a99f7f6d62e37..bb566d8b8e4907f8b979222d36f421914cca50bf 100644 --- a/test/net/test_yolo.cpp +++ b/test/net/test_yolo.cpp @@ -34,7 +34,9 @@ int main() { std::vector input(input_tensor.data(), input_tensor.data() + input_tensor.numel()); // 预热一次 - paddle_mobile.Predict(input, dims); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } auto time3 = time(); for (int i = 0; i < 10; ++i) { paddle_mobile.Predict(input, dims);