diff --git a/README.md b/README.md
index 05a109a81791ac85d975138dcd76f7f71716624a..de8fe0bb7f4613bd9d6dfebd82db1d407ee682f4 100644
--- a/README.md
+++ b/README.md
@@ -28,19 +28,19 @@ Paddle-Moible是PaddlePaddle组织下的项目，是一个致力于嵌入式平
 |mobilenet arm v7|1线程|2线程|4线程|
 |------------|----|-----|-----|
-|麒麟960(ms)|110.586|70.897|47.474|
+|麒麟960(ms)|110.586|63.285|38.215|
 |||||
 |mobilenetssd arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|222.124|138.952|90.856|
+|麒麟960(ms)|220.248|128.473|79.334|
 |||||
 |googlenet(v1) arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|348.018|240.304|169.998|
+|麒麟960(ms)|341.965|228.724|161.531|
 |||||
 |squeezenet arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|84.685|56.544|38.833|
+|麒麟960(ms)|84.080|55.641|37.182|
 |||||
 |yolo arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|131.831|88.990|60.905|
+|麒麟960(ms)|129.445|80.627|50.936|
 
 arm cpu是paddle-mobile的主要支持方向，cpu的通用性一直是其优势。嵌入式深度学习，需要大量的cpu汇编实现。我们正在紧锣密鼓的编码，为的是能充分硬件的每一点加速能力。
 
 arm cpu的优化工作还在进行中，现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是110+ms，显然这不是我们的最终目标，我们正在用大量的汇编改写，后续性能仍会有巨大提升空间, 目前只支持armv7, 未来我们也会支持armv8。
diff --git a/src/io/executor.cpp b/src/io/executor.cpp
index 73e6c9d6f170fc4eebb6af2f8b7a67c847961950..91005287055b7af859d738ea20c40abbf5f7db96 100644
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -192,8 +192,14 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
     }
     *data += (memory_size * sizeof(uint8_t));
   } else {
-    for (int n = 0; n < memory_size * type_size; ++n) {
-      static_cast<char *>(memory)[n] = (*data)[n];
+    for (int n = 0; n < memory_size; n++) {
+      float value;
+      memcpy(&value, *data + n * type_size, type_size);
+      if (value < 1e-30 && value > -1e-30) {
+        static_cast<float *>(memory)[n] = 0.0;
+      } else {
+        static_cast<float *>(memory)[n] = value;
+      }
     }
     (*data) += (sizeof(char) * memory_size * type_size);
   }
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index 3730cf350a1399e5f3c1473fd1ce8d7b1d13b1b6..0fb454c89d66dabdcdd40c6590120016182c6629 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -2957,8 +2957,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
       "vmov.f32 q15, #0.0 \n\t"
 
       "subs %[kc1], %[kc1], #1 \n\t"
-      "blt end_kc1_%= \n\t"
-      "loop_kc1_%=: \n\t"
+      "blt 2f \n\t"
+      "1: \n\t"
 
       // "pld [%[a_ptr], #128] \n\t"
       // "pld [%[b_ptr], #128] \n\t"
@@ -3030,12 +3030,12 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
       "vmla.f32 q15, q3, d2[1] \n\t"
 
       "subs %[kc1], %[kc1], #1 \n\t"
-      "bge loop_kc1_%= \n\t"
-      "end_kc1_%=: \n\t"
+      "bge 1b \n\t"
+      "2: \n\t"
 
       "subs %[kc2], %[kc2], #1 \n\t"
-      "blt end_kc2_%= \n\t"
-      "loop_kc2_%=: \n\t"
+      "blt 4f \n\t"
+      "3: \n\t"
 
       "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
       "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
@@ -3054,8 +3054,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
      "vmla.f32 q15, q3, d2[1] \n\t"
 
      "subs %[kc2], %[kc2], #1 \n\t"
-     "bge loop_kc2_%= \n\t"
-     "end_kc2_%=: \n\t"
+     "bge 3b \n\t"
+     "4: \n\t"
 
      "mov r5, %[c] \n\t"
      "mov r6, %[step] \n\t"
@@ -3113,8 +3113,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
      "dup v28.4s, wzr \n\t"
 
      "subs %[kc1], %[kc1], #1 \n\t"
-     "blt end_kc1_%= \n\t"
-     "loop_kc1_%=: \n\t"
+     "blt 2f \n\t"
+     "1: \n\t"
 
      "prfm pldl1keep, [%[a_ptr], #32] \n\t"
      "prfm pldl1keep, [%[b_ptr], #48] \n\t"
@@ -3149,8 +3149,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
      "fmla v28.4s, v4.4s, v1.s[3] \n\t"
 
      "subs %[kc1], %[kc1], #1 \n\t"
-     "bge loop_kc1_%= \n\t"
-     "end_kc1_%=: \n\t"
+     "bge 1b \n\t"
+     "2: \n\t"
 
      "st1 {v5.4s, v6.4s, v7.4s}, [%[c]], %[step] \n\t"
      "st1 {v8.4s, v9.4s, v10.4s}, [%[c]], %[step] \n\t"
@@ -3205,8 +3205,8 @@ void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) {
      "dup v29.4s, wzr \n\t"
 
      "subs %[kc1], %[kc1], #1 \n\t"
-     "blt end_kc1_%= \n\t"
-     "loop_kc1_%=: \n\t"
+     "blt 2f \n\t"
+     "1: \n\t"
 
      "prfm pldl1keep, [%[a_ptr], #24] \n\t"
      "prfm pldl1keep, [%[b_ptr], #64] \n\t"
@@ -3245,8 +3245,8 @@ void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) {
      "fmla v29.4s, v5.4s, v1.s[1] \n\t"
 
      "subs %[kc1], %[kc1], #1 \n\t"
-     "bge loop_kc1_%= \n\t"
-     "end_kc1_%=: \n\t"
+     "bge 1b \n\t"
+     "2: \n\t"
 
      "st1 {v6.4s, v7.4s, v8.4s, v9.4s}, [%[c]], %[step] \n\t"
      "st1 {v10.4s, v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t"
diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp
index 4c14f63bde40675a7e0016e28d900788431ff2ae..5d89618859d47fd7d61d61871583e1ebbca3db33 100644
--- a/test/net/test_squeezenet.cpp
+++ b/test/net/test_squeezenet.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(2);
+  paddle_mobile.SetThreadNum(4);
   // ../../../test/models/googlenet
   // ../../../test/models/mobilenet
   auto time1 = time();
diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp
index 83508cff335c55f5cc416c6652d83706a4626c1a..ffe3cdc22c4f847da2503192660a99f7f6d62e37 100644
--- a/test/net/test_yolo.cpp
+++ b/test/net/test_yolo.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(2);
+  paddle_mobile.SetThreadNum(4);
   // ../../../test/models/googlenet
   // ../../../test/models/mobilenet
   auto time1 = time();
diff --git a/test/operators/test_fusion_conv_add_bn_relu_op.cpp b/test/operators/test_fusion_conv_add_bn_relu_op.cpp
index 81400d987195364c06b4b93d0859469b43f90e7b..7764d95ed72da613459233bd55ddcffdc444318f 100644
--- a/test/operators/test_fusion_conv_add_bn_relu_op.cpp
+++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include
 #include "../test_include.h"
 #include "operators/fusion_conv_add_bn_relu_op.h"
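
Note on the `src/io/executor.cpp` hunk: `LoadMemory` now reads each stored weight through `memcpy` and flushes any value with magnitude below 1e-30 to exact zero before writing it into the tensor, so the ARM kernels never operate on subnormal (denormal) floats, which many CPUs handle far more slowly than normal values. Below is a minimal standalone sketch of the same flush-to-zero idea; the helper name `CopyWeightsFlushTiny` and the `main()` driver are hypothetical, not paddle-mobile API.

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Copy raw little-endian float bytes into a destination buffer, zeroing any
// value with |v| < 1e-30 (the same threshold the patch uses) so that later
// arithmetic never touches denormals. Hypothetical helper for illustration.
void CopyWeightsFlushTiny(const uint8_t *src, float *dst, int count) {
  const int type_size = sizeof(float);
  for (int n = 0; n < count; ++n) {
    float value;
    std::memcpy(&value, src + n * type_size, type_size);
    dst[n] = (value < 1e-30f && value > -1e-30f) ? 0.0f : value;
  }
}

int main() {
  float raw[2] = {1e-40f, 0.5f};  // 1e-40f is a subnormal float
  std::vector<uint8_t> bytes(sizeof(raw));
  std::memcpy(bytes.data(), raw, sizeof(raw));

  float out[2];
  CopyWeightsFlushTiny(bytes.data(), out, 2);
  std::cout << out[0] << " " << out[1] << std::endl;  // prints "0 0.5"
  return 0;
}
```

Like the patched loop, the sketch assumes the stored element type is a 4-byte float, which is why copying `type_size` bytes into a `float` is safe here.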
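Note on the `src/operators/math/gemm.cpp` hunks: the inline-assembly loops in `AddDot6x8`, `AddDot8x12` and `AddDot6x16` switch from named labels such as `loop_kc1_%=` / `end_kc1_%=` to GNU assembler numeric local labels (`1:`, `2:`, referenced as `1b` / `2f`). Numeric local labels may be defined any number of times, so the block still assembles even if the compiler emits it more than once (after inlining, cloning, or unrolling), whereas a repeated named label fails with a "symbol already defined" error; that is presumably the motivation here. The sketch below reproduces the same `blt 2f` / `1:` / `bge 1b` / `2:` skeleton in a self-contained function; it targets AArch64 with a portable C++ fallback and is illustrative only, not code from the patch.

```cpp
#include <cstdio>

// Count from 0 up to n using the same local-label skeleton as the patched
// kernels: skip the loop when the trip count is exhausted (blt 2f), loop
// back with bge 1b, and fall through to the numeric end label 2:.
long CountIterations(long n) {
#if defined(__aarch64__)
  long iters = 0;
  asm volatile(
      "subs %[n], %[n], #1         \n\t"  // n -= 1, set condition flags
      "blt  2f                     \n\t"  // nothing to do if n was 0
      "1:                          \n\t"  // loop body (numeric local label)
      "add  %[iters], %[iters], #1 \n\t"
      "subs %[n], %[n], #1         \n\t"
      "bge  1b                     \n\t"  // branch back to the nearest "1:"
      "2:                          \n\t"  // loop exit
      : [n] "+r"(n), [iters] "+r"(iters)
      :
      : "cc");
  return iters;
#else
  long iters = 0;  // portable fallback for non-AArch64 builds
  for (long i = 0; i < n; ++i) ++iters;
  return iters;
#endif
}

int main() {
  std::printf("%ld\n", CountIterations(5));  // prints 5
  return 0;
}
```

`1b` means "the nearest `1:` looking backwards" and `2f` "the nearest `2:` looking forwards", so the entry check can jump over the loop body without ever naming a unique symbol.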