Merge branch 'develop' into develop

a12c9bfd · zhangyang0701 · GitHub · 727cc8dc · e62404ec · a12c9bfd
6 changed files
--- a/README.md
+++ b/README.md
@@ -28,19 +28,19 @@ Paddle-Mobile是PaddlePaddle组织下的项目，是一个致力于嵌入式平
 |mobilenet arm v7|1线程|2线程|4线程|
 |------------|----|-----|-----|
-|麒麟960(ms)|110.586|70.897|47.474|
+|麒麟960(ms)|110.586|63.285|38.215|
 |||||
 |mobilenetssd arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|222.124|138.952|90.856|
+|麒麟960(ms)|220.248|128.473|79.334|
 |||||
 |googlenet(v1) arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|348.018|240.304|169.998|
+|麒麟960(ms)|341.965|228.724|161.531|
 |||||
 |squeezenet arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|84.685|56.544|38.833|
+|麒麟960(ms)|84.080|55.641|37.182|
 |||||
 |yolo arm v7|1线程|2线程|4线程|
-|麒麟960(ms)|131.831|88.990|60.905|
+|麒麟960(ms)|129.445|80.627|50.936|
    arm cpu是paddle-mobile的主要支持方向，cpu的通用性一直是其优势。嵌入式深度学习，需要大量的cpu汇编实现。我们正在紧锣密鼓地编码，为的是能充分利用硬件的每一点加速能力。
    arm cpu的优化工作还在进行中，现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是110+ms，显然这不是我们的最终目标，我们正在用大量的汇编改写，后续性能仍会有巨大提升空间。目前只支持armv7，未来我们也会支持armv8。

--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -192,8 +192,14 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
-    for (int n = 0; n < memory_size * type_size; ++n) {
+    for (int n = 0; n < memory_size; n++) {
-      static_cast<char *>(memory)[n] = (*data)[n];
+      float value;
+      memcpy(&value, *data + n * type_size, type_size);
+      if (value < 1e-30 && value > -1e-30) {
+        static_cast<float *>(memory)[n] = 0.0;
+      } else {
+        static_cast<float *>(memory)[n] = value;
+      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }

--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -2957,8 +2957,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
      "vmov.f32   q15,    #0.0          \n\t"
      "subs       %[kc1], %[kc1], #1    \n\t"
-      "blt        end_kc1_%=            \n\t"
+      "blt        2f                    \n\t"
-      "loop_kc1_%=:                     \n\t"
+      "1:                               \n\t"
      //      "pld        [%[a_ptr], #128]       \n\t"
      //      "pld        [%[b_ptr], #128]       \n\t"
@@ -3030,12 +3030,12 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
      "vmla.f32   q15,  q3,   d2[1]       \n\t"
      "subs       %[kc1], %[kc1], #1      \n\t"
-      "bge        loop_kc1_%=             \n\t"
+      "bge        1b                      \n\t"
-      "end_kc1_%=:                        \n\t"
+      "2:                                 \n\t"
      "subs       %[kc2], %[kc2], #1      \n\t"
-      "blt        end_kc2_%=              \n\t"
+      "blt        4f                      \n\t"
-      "loop_kc2_%=:                       \n\t"
+      "3:                                 \n\t"
      "vld1.32    {d0-d2},  [%[a_ptr]]!   \n\t"
      "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
@@ -3054,8 +3054,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
      "vmla.f32   q15,  q3,   d2[1]       \n\t"
      "subs       %[kc2], %[kc2], #1      \n\t"
-      "bge        loop_kc2_%=             \n\t"
+      "bge        3b                      \n\t"
-      "end_kc2_%=:                        \n\t"
+      "4:                                 \n\t"
      "mov        r5,     %[c]            \n\t"
      "mov        r6,     %[step]         \n\t"
@@ -3113,8 +3113,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
      "dup      v28.4s,    wzr     \n\t"
      "subs       %[kc1], %[kc1], #1    \n\t"
-      "blt        end_kc1_%=            \n\t"
+      "blt        2f                    \n\t"
-      "loop_kc1_%=:                     \n\t"
+      "1:                               \n\t"
      "prfm     pldl1keep,         [%[a_ptr],   #32]  \n\t"
      "prfm     pldl1keep,         [%[b_ptr],   #48]  \n\t"
@@ -3149,8 +3149,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
      "fmla     v28.4s,   v4.4s,   v1.s[3]       \n\t"
      "subs       %[kc1], %[kc1], #1      \n\t"
-      "bge        loop_kc1_%=             \n\t"
+      "bge        1b                      \n\t"
-      "end_kc1_%=:                        \n\t"
+      "2:                                 \n\t"
      "st1      {v5.4s,   v6.4s,  v7.4s},    [%[c]],   %[step]   \n\t"
      "st1      {v8.4s,   v9.4s,  v10.4s},   [%[c]],   %[step]   \n\t"
@@ -3205,8 +3205,8 @@ void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) {
      "dup      v29.4s,    wzr     \n\t"
      "subs       %[kc1], %[kc1], #1    \n\t"
-      "blt        end_kc1_%=            \n\t"
+      "blt        2f                    \n\t"
-      "loop_kc1_%=:                     \n\t"
+      "1:                               \n\t"
      "prfm   pldl1keep,  [%[a_ptr],  #24]  \n\t"
      "prfm   pldl1keep,  [%[b_ptr],  #64]  \n\t"
@@ -3245,8 +3245,8 @@ void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) {
      "fmla     v29.4s,   v5.4s,   v1.s[1]       \n\t"
      "subs       %[kc1], %[kc1], #1      \n\t"
-      "bge        loop_kc1_%=             \n\t"
+      "bge        1b                      \n\t"
-      "end_kc1_%=:                        \n\t"
+      "2:                                 \n\t"
      "st1      {v6.4s,  v7.4s,  v8.4s,  v9.4s},    [%[c]],   %[step]   \n\t"
      "st1      {v10.4s, v11.4s, v12.4s, v13.4s},   [%[c]],   %[step]   \n\t"

--- a/test/net/test_squeezenet.cpp
+++ b/test/net/test_squeezenet.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(2);
+  paddle_mobile.SetThreadNum(4);
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
  auto time1 = time();

--- a/test/net/test_yolo.cpp
+++ b/test/net/test_yolo.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(2);
+  paddle_mobile.SetThreadNum(4);
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
  auto time1 = time();

--- a/test/operators/test_fusion_conv_add_bn_relu_op.cpp
+++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <iostream>
 #include "../test_include.h"
 #include "operators/fusion_conv_add_bn_relu_op.h"