diff --git a/README.md b/README.md index a5f35f764b1b86c0eb3982a10f8dd699af077880..fd5222655821e36fe194225a4d71a3b60b8a89d5 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,42 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 - **ARM CPU** - arm cpu是paddle-mobile的主要支持方向,cpu的通用性一直是其优势。嵌入式深度学习,需要大量的cpu汇编实现。我们正在紧锣密鼓的编码,为的是能充分硬件的每一点加速能力。 - arm cpu的优化工作还在进行中,现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是110+ms,显然这不是我们的最终目标,我们正在用大量的汇编改写,后续性能仍会有巨大提升空间, 目前只支持armv7, 未来我们也会支持armv8。 +|mobilenet arm v7|1线程|2线程|4线程| +|------------|----|-----|-----| +|麒麟970(ms)|108.180|63.935|37.545| +|麒麟960(ms)|108.588|63.073|36.822| +|高通845(ms)|85.952|48.890|28.641| +|高通835(ms)|105.434|62.752|37.131| +||||| +|mobilenetssd arm v7|1线程|2线程|4线程| +|麒麟970(ms)|212.686|127.205|77.485| +|麒麟960(ms)|212.641|125.338|75.250| +|高通845(ms)|182.863|95.671|56.857| +|高通835(ms)|213.849|127.717|77.006| +||||| +|googlenet(v1) arm v7|1线程|2线程|4线程| +|麒麟970(ms)|335.288|234.559|161.295| +|麒麟960(ms)|354.443|232.642|157.815| +|高通845(ms)|282.007|173.146|122.148| +|高通835(ms)|341.250|233.354|158.554| +||||| +|squeezenet arm v7|1线程|2线程|4线程| +|麒麟970(ms)|83.726|57.944|36.923| +|麒麟960(ms)|85.835|55.762|36.496| +|高通845(ms)|71.301|41.618|28.785| +|高通835(ms)|82.407|56.176|36.455| +||||| +|yolo arm v7|1线程|2线程|4线程| +|麒麟970(ms)|129.658|79.993|49.969| +|麒麟960(ms)|130.208|78.791|48.390| +|高通845(ms)|109.244|61.736|40.600| +|高通835(ms)|130.402|80.863|50.359| + + 测试机型信息: + 麒麟970:荣耀v10 (2.36GHz * 4 + 1.8GHz * 4) + 麒麟960:华为mate9 (2.36GHz * 4 + 1.8GHz * 4) + 高通835(骁龙835):小米6 (2.45GHz * 4 + 1.9GHz * 4) + 高通845(骁龙845):OPPO FindX (2.80GHz * 4 + 1.8GHz * 4) - **Mali GPU** diff --git a/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h b/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h index 51dac36aee0634ff0fb2b4bc1bcc45f663f84462..343e5f147644cc5bb86c2929d4bd35b44301c4cf 100644 --- a/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h 
@@ -96,10 +96,6 @@ void ConvTransposeCompute(const ConvTransposeParam &param) { math::matmul(filter_slice, true, in_slice, false, static_cast

(1.0), &col_matrix, static_cast

(0.0)); - //初始化out_slice的值为0 - float *tmp = out_slice.data(); - int numel = out_slice.numel(); - memset(static_cast(tmp), 0, sizeof(float) * numel); if (data_dim == 2U) { col2im(col, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp index cb4d02f45c4229475d5c8ce896427a9aad228ef6..4065f7d9c4934bce8285ea99fe4f14c4e2cc990c 100644 --- a/src/operators/math/im2col.cpp +++ b/src/operators/math/im2col.cpp @@ -481,6 +481,7 @@ class Col2ImFunctor { T *im_data = im->data(); const T *col_data = col.data(); + memset(static_cast(im_data), 0, sizeof(T) * im->numel()); for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width;