Commit de913954 authored by zhaojiaying01

update unit test and jni

Parent 25830a06
@@ -219,7 +219,7 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
   env->DeleteLocalRef(ddims);
   env->ReleaseFloatArrayElements(buf, dataPointer, 0);
   env->DeleteLocalRef(buf);
-  env->DeleteLocalRef(dataPointer);
+  // env->DeleteLocalRef(dataPointer);
 #endif
   ANDROIDLOGI("predictImage finished");
...
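The removed `DeleteLocalRef(dataPointer)` call was a bug: `dataPointer` comes from `GetFloatArrayElements`, so it is a raw `jfloat*`, not a JNI local reference, and passing it to `DeleteLocalRef` is undefined behavior. The `ReleaseFloatArrayElements` call above is the correct cleanup. A minimal sketch of the intended pattern (identifiers other than the JNI API itself are illustrative, not the exact paddle-mobile code):

```cpp
#include <jni.h>

// Sketch: correct lifetime handling for a jfloatArray input in JNI.
void consumeFloatArray(JNIEnv *env, jfloatArray buf) {
  // Pins (or copies) the array contents; returns jfloat*, NOT a jobject.
  jfloat *dataPointer = env->GetFloatArrayElements(buf, nullptr);
  if (dataPointer == nullptr) return;  // pending OutOfMemoryError

  // ... use dataPointer ...

  // Correct release: mode 0 copies back changes and unpins the array.
  env->ReleaseFloatArrayElements(buf, dataPointer, 0);
  // DeleteLocalRef is only valid for references such as `buf` itself:
  env->DeleteLocalRef(buf);
  // env->DeleteLocalRef(dataPointer);  // WRONG: not a local reference
}
```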
@@ -402,7 +402,7 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
       : "memory", "v0", "v1");
 #else
       asm volatile(
-          "pld        [%[b0]]                     \n\t"
+          // "pld        [%[b0]]                  \n\t"
           "vld1.32    {q0, q1}, [%[b0]]           \n\t"
           "vst1.32    {q0, q1}, [%[local_buffer]]!     \n\t"
           : [local_buffer] "+r"(local_buffer)
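The disabled `pld` prefetched the very address that the immediately following `vld1.32` loads, so it bought little. For reference, the NEON pair streams one 8-float strip of B into the packed buffer; a scalar equivalent of just that copy (a sketch; the real routine also handles the `n_tail` edge):

```cpp
// Sketch of what the vld1.32/vst1.32 pair does per row of B:
// copy 8 consecutive floats from the source row into the packed buffer.
inline void pack_b_row_8(const float *b0, float *&local_buffer) {
  for (int i = 0; i < 8; ++i) {
    *local_buffer++ = b0[i];  // q0/q1 hold these 8 floats in the asm
  }
}
```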
@@ -454,7 +454,7 @@ void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
       : "memory", "v0", "v1");
 #else
       asm volatile(
-          "pld        [%[b0]]                     \n\t"
+          // "pld        [%[b0]]                  \n\t"
           "vld1.32    {q0, q1}, [%[b0]]           \n\t"
           "vst1.32    {q0, q1}, [%[local_buffer]]!     \n\t"
           : [local_buffer] "+r"(local_buffer)
@@ -2528,7 +2528,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
   // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
   // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
   int L1 = 32 * 1024;
-  int L2 = 0.5 * 1024 * 1024;
+  int L2 = 512 * 1024;
   KC = k;
   MC = L1 / (KC * sizeof(float));
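`0.5 * 1024 * 1024` is evaluated in `double` and then truncated on assignment to `int`; `512 * 1024` is the same 512 KiB expressed in pure integer arithmetic, avoiding the implicit narrowing. The constants feed the GEMM blocking factors; roughly (a sketch of the idea, and NC's formula is an assumption, not quoted from the source):

```cpp
#include <cstddef>

// Sketch: size the packed panels from the cache budget so that a
// KC-deep A panel fits in L1 and a KC-deep B panel fits in L2.
void choose_blocks(int k, int &KC, int &MC, int &NC) {
  const int L1 = 32 * 1024;   // 32 KiB L1 data cache
  const int L2 = 512 * 1024;  // 512 KiB, integer form of 0.5 MiB
  KC = k;
  MC = L1 / (KC * static_cast<int>(sizeof(float)));  // rows of A per panel
  NC = L2 / (KC * static_cast<int>(sizeof(float)));  // cols of B per panel
}
```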
@@ -2552,10 +2552,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
   packedC = static_cast<float *>(
       paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
   zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
   memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
-  for (int l = 0; l < KC; ++l) {
-    zero[l] = 0;
-  }
   int mc, nc;
   for (int j = 0; j < n; j += NC) {
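The deleted loop re-zeroed a buffer that the `memset` on the previous line had already cleared. Dropping it is safe because IEEE-754 `0.0f` is represented as all-zero bits, so zero-filling the bytes fully initializes a float buffer:

```cpp
#include <cstring>

// Sketch: memset is a valid zero-initializer for float buffers, since
// 0.0f is all-zero bits in IEEE 754.
void zero_floats(float *buf, int count) {
  std::memset(static_cast<void *>(buf), 0, sizeof(float) * count);
  // Equivalent to, but typically faster than:
  //   for (int l = 0; l < count; ++l) buf[l] = 0;
}
```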
@@ -2591,7 +2588,7 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
   // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
   // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
   int L1 = 32 * 1024;
-  int L2 = 0.5 * 1024 * 1024;
+  int L2 = 512 * 1024;
   KC = k;
   MC = L1 / (KC * sizeof(float));
@@ -2615,10 +2612,7 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
   packedC = static_cast<float *>(
       paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
   zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
   memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
-  for (int l = 0; l < KC; ++l) {
-    zero[l] = 0;
-  }
   int mc, nc;
   for (int j = 0; j < n; j += NC) {
@@ -2658,7 +2652,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
   int max_threads = 1;
 #endif
-  int L1 = 32 * 1024;
+  int L1 = 64 / max_threads * 1024;
   KC = k;
   if (m > n) {
     // block along A
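The new expression divides a 64 KiB budget across the worker threads before scaling to bytes: `/` binds before `*` here, so e.g. 4 threads get 16 KiB each. Note it is integer division, so any `max_threads` above 64 would yield 0, though that is far beyond realistic mobile core counts. A sketch of the arithmetic:

```cpp
#include <cstdio>

int main() {
  for (int max_threads : {1, 2, 4, 8}) {
    int L1 = 64 / max_threads * 1024;  // per-thread share of 64 KiB
    std::printf("threads=%d -> L1 budget = %d KiB per thread\n",
                max_threads, L1 / 1024);
  }
}
// threads=1 -> 64 KiB, threads=2 -> 32 KiB,
// threads=4 -> 16 KiB, threads=8 -> 8 KiB
```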
@@ -2765,7 +2759,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
   int max_threads = 1;
 #endif
-  int L1 = 32 * 1024;
+  int L1 = 64 / max_threads * 1024;
   KC = k;
   if (m > n) {
     // block along A
@@ -2934,14 +2928,14 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
   const float *a_ptr, *b_ptr;
   a_ptr = a;
   b_ptr = b;
-  int kc1 = k / 4;
-  int kc2 = k % 4;
+  int kc1 = k / 8;
+  int kc2 = k % 8;
   int step = 4 * ldc;
   asm volatile(
       "pld        [%[a_ptr]]            \n\t"
+      "pld        [%[a_ptr], #64]       \n\t"
       "pld        [%[b_ptr]]            \n\t"
-      "pld        [%[a_ptr], #64]       \n\t"
-      "pld        [%[b_ptr], #64]       \n\t"
+      "pld        [%[b_ptr], #64]       \n\t"

       "vmov.f32   q4,     #0.0          \n\t"
       "vmov.f32   q5,     #0.0          \n\t"
@@ -2960,10 +2954,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
       "blt        2f                    \n\t"
       "1:                               \n\t"
-      // "pld        [%[a_ptr], #128]   \n\t"
-      // "pld        [%[b_ptr], #128]   \n\t"
-      // "pld        [%[a_ptr], #192]   \n\t"
-      // "pld        [%[b_ptr], #192]   \n\t"
+      "pld        [%[a_ptr], #128]      \n\t"
+      "pld        [%[b_ptr], #128]      \n\t"
       "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
       "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
@@ -2997,6 +2989,79 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
       "vmla.f32   q14, q2, d2[1]        \n\t"
       "vmla.f32   q15, q3, d2[1]        \n\t"

+      "pld        [%[a_ptr], #128]      \n\t"
+      "pld        [%[b_ptr], #128]      \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "pld        [%[a_ptr], #128]      \n\t"
+      "pld        [%[b_ptr], #128]      \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "pld        [%[a_ptr], #128]      \n\t"
+      "pld        [%[b_ptr], #128]      \n\t"
+
       "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
       "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
...
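With `kc1 = k / 8` the main loop now consumes eight k-steps per iteration instead of four, halving loop overhead, and the re-enabled `pld` lines prefetch the A and B panels ahead of the loads. Each unrolled group is one rank-1 update of a 6x8 accumulator tile; a scalar reference of the whole kernel (a sketch, equivalent in effect to NEON registers q4-q15; whether the tile adds to or overwrites `c` depends on the surrounding write-back code):

```cpp
// Sketch: scalar reference for the 6x8 micro-kernel body.
// a: packed 6-row panel of A (6 floats per k-step);
// b: packed 8-column panel of B (8 floats per k-step);
// c: 6x8 output tile with leading dimension ldc.
void AddDot6x8_reference(int k, const float *a, const float *b,
                         float *c, int ldc) {
  float acc[6][8] = {};             // q4..q15 in the asm version
  for (int p = 0; p < k; ++p) {     // one vmla group per k-step
    for (int i = 0; i < 6; ++i) {   // lanes d0[0..1], d1[0..1], d2[0..1]
      for (int j = 0; j < 8; ++j) { // q2 holds j<4, q3 holds j>=4
        acc[i][j] += a[p * 6 + i] * b[p * 8 + j];
      }
    }
  }
  for (int i = 0; i < 6; ++i)
    for (int j = 0; j < 8; ++j)
      c[i * ldc + j] += acc[i][j];
}
```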
@@ -35,7 +35,9 @@ int main() {
   std::vector<int64_t> dims{1, 3, 224, 224};
   GetInput<float>(g_test_image_1x3x224x224, &input, dims);
   // warm up
-  auto vec_result = paddle_mobile.Predict(input, dims);
+  for (int i = 0; i < 10; ++i) {
+    auto vec_result = paddle_mobile.Predict(input, dims);
+  }
   auto time3 = time();
   for (int i = 0; i < 10; ++i) {
     auto vec_result = paddle_mobile.Predict(input, dims);
...
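Running ten warm-up predictions instead of one gives the caches, thread pool, and the CPU frequency governor time to settle before the timed loop, so the reported average is less noisy. The pattern all of the test changes below follow, sketched (`time()` comes from the test code above; `time_diff` is assumed from the test utilities, its exact name is an assumption):

```cpp
// Sketch of the benchmarking pattern used by these tests.
for (int i = 0; i < 10; ++i) {
  paddle_mobile.Predict(input, dims);  // warm-up: results discarded
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
  paddle_mobile.Predict(input, dims);  // timed region
}
auto time4 = time();
// average latency over the 10 timed runs:
//   time_diff(time3, time4) / 10
```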
@@ -33,7 +33,9 @@ int main() {
   GetInput<float>(g_hand, &input, dims);
   // warm up
-  auto output = paddle_mobile.Predict(input, dims);
+  for (int i = 0; i < 10; ++i) {
+    auto output = paddle_mobile.Predict(input, dims);
+  }
   auto time3 = time();
   for (int i = 0; i < 10; ++i) {
     auto output = paddle_mobile.Predict(input, dims);
...
@@ -39,6 +39,9 @@ int main() {
   std::cout << " Max element is " << *biggest << " at position "
             << std::distance(std::begin(vec_result), biggest) << std::endl;

+  for (int i = 0; i < 10; ++i) {
+    auto vec_result = paddle_mobile.Predict(input, dims);
+  }
   auto time3 = time();
   for (int i = 0; i < 10; ++i) {
     auto vec_result = paddle_mobile.Predict(input, dims);
...
@@ -33,7 +33,9 @@ int main() {
   std::vector<float> input(input_tensor.data<float>(),
                            input_tensor.data<float>() + input_tensor.numel());
   // warm up
-  paddle_mobile.Predict(input, dims);
+  for (int i = 0; i < 10; ++i) {
+    paddle_mobile.Predict(input, dims);
+  }
   auto time3 = time();
   for (int i = 0; i < 10; ++i) {
     paddle_mobile.Predict(input, dims);
...
@@ -34,7 +34,9 @@ int main() {
   std::vector<float> input(input_tensor.data<float>(),
                            input_tensor.data<float>() + input_tensor.numel());
   // warm up
-  paddle_mobile.Predict(input, dims);
+  for (int i = 0; i < 10; ++i) {
+    paddle_mobile.Predict(input, dims);
+  }
   auto time3 = time();
   for (int i = 0; i < 10; ++i) {
     paddle_mobile.Predict(input, dims);
...