Merge remote-tracking branch 'upstream/develop' into develop

f9ea0707 · zhangyang · 4ec9cefb · 63236f42 · f9ea0707 · f9ea0707
6 changed files
--- a/README.md
+++ b/README.md
@@ -35,6 +35,12 @@ Paddle-Moible是PaddlePaddle组织下的项目，是一个致力于嵌入式平
 |||||
 |googlenet(v1) arm v7|1线程|2线程|4线程|
 |麒麟960(ms)|348.018|240.304|169.998|
+|||||
+|squeezenet arm v7|1线程|2线程|4线程|
+|麒麟960(ms)|84.685|56.544|38.833|
+|||||
+|yolo arm v7|1线程|2线程|4线程|
+|麒麟960(ms)|131.831|88.990|60.905|
    arm cpu是paddle-mobile的主要支持方向，cpu的通用性一直是其优势。嵌入式深度学习，需要大量的cpu汇编实现。我们正在紧锣密鼓地编码，为的是能充分利用硬件的每一点加速能力。
    arm cpu的优化工作还在进行中，现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是110+ms，显然这不是我们的最终目标，我们正在用大量的汇编改写，后续性能仍会有巨大提升空间, 目前只支持armv7, 未来我们也会支持armv8。

--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -16,6 +16,8 @@ limitations under the License. */
 #pragma once
+#include <string>
+#include <vector>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
 #include "operators/kernel/conv_add_relu_kernel.h"
@@ -65,11 +67,11 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
 #ifdef PADDLE_MOBILE_CPU
-//#ifndef CONV_ADD_RELU_REGISTER
+#ifndef CONV_ADD_RELU_REGISTER
-//#define CONV_ADD_RELU_REGISTER
+#define CONV_ADD_RELU_REGISTER
-// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new
+static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
-// FusionConvAddReluOpMatcher());
+    new FusionConvAddReluOpMatcher());
-//#endif
+#endif
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU

--- a/test/common/test_gemm_perf.cpp
+++ b/test/common/test_gemm_perf.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <iostream>
 #include "../test_helper.h"
+#include "../test_include.h"
 #include "operators/math/gemm.h"
 #include "operators/math/math_function.h"
@@ -26,6 +27,8 @@ limitations under the License. */
 #define k 1024
 int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
  Tensor aa, bb, cc, scale, bias;
  auto aaptr = aa.mutable_data<float>({m, k});
  auto bbptr = bb.mutable_data<float>({k, n});

--- a/test/net/test_resnet.cpp
+++ b/test/net/test_resnet.cpp
@@ -12,16 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
  auto time1 = time();
-  if (paddle_mobile.Load(g_resnet, false)) {
+  if (paddle_mobile.Load(g_resnet, true)) {
    auto time2 = time();
-    DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
    std::vector<int64_t> dims{1, 3, 32, 32};
    Tensor input_tensor;
    SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, static_cast<float>(0),
@@ -29,10 +30,15 @@ int main() {
    std::vector<float> input(input_tensor.data<float>(),
                             input_tensor.data<float>() + input_tensor.numel());
+    // Warm-up run (not timed)
+    paddle_mobile.Predict(input, dims);
    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
+    }
    auto time4 = time();
-    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
+              << std::endl;
  }
  return 0;

--- a/test/net/test_squeezenet.cpp
+++ b/test/net/test_squeezenet.cpp
@@ -12,18 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(2);
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
  auto time1 = time();
-  if (paddle_mobile.Load(g_squeezenet, false)) {
+  if (paddle_mobile.Load(g_squeezenet, true)) {
    auto time2 = time();
-    DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
    std::vector<int64_t> dims{1, 3, 227, 227};
    Tensor input_tensor;
    SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
@@ -31,10 +32,15 @@ int main() {
    std::vector<float> input(input_tensor.data<float>(),
                             input_tensor.data<float>() + input_tensor.numel());
+    // 预热一次
+    paddle_mobile.Predict(input, dims);
    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
+    }
    auto time4 = time();
-    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
+              << std::endl;
  }
  return 0;

--- a/test/net/test_yolo.cpp
+++ b/test/net/test_yolo.cpp
@@ -12,18 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(2);
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
  auto time1 = time();
-  if (paddle_mobile.Load(g_yolo, false)) {
+  if (paddle_mobile.Load(g_yolo, true)) {
    auto time2 = time();
-    DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
    std::vector<int64_t> dims{1, 3, 227, 227};
    Tensor input_tensor;
@@ -32,10 +33,15 @@ int main() {
    std::vector<float> input(input_tensor.data<float>(),
                             input_tensor.data<float>() + input_tensor.numel());
+    // 预热一次
+    paddle_mobile.Predict(input, dims);
    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
+    }
    auto time4 = time();
-    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
+              << std::endl;
  }
  return 0;
 }