Unverified · Commit 4acdfa17 · Authored by: R Ruilong Liu · Committed by: GitHub

Merge branch 'develop' into develop

......@@ -30,7 +30,7 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms
![](http://7xop3k.com1.z0.glb.clouddn.com/15312108766575.jpg)
The ARM CPU is paddle-mobile's primary target, and the CPU's generality has always been its advantage. Embedded deep learning calls for a large amount of hand-written CPU assembly. We are coding at full speed to squeeze every last bit of acceleration out of the hardware.
Optimization of the ARM CPU path is still in progress; so far only generic CPU optimizations are in place. On an ARM Cortex-A73, paddle-mobile (arm-v7) currently runs a single-core pass of MobileNet 1.0 in 120+ ms. This is clearly not our final goal: we are rewriting hot paths in assembly, so there is still huge headroom for performance.
Optimization of the ARM CPU path is still in progress; so far only generic CPU optimizations are in place. On an ARM Cortex-A73, paddle-mobile (arm-v7) currently runs a single-core pass of MobileNet 1.0 in 120+ ms. This is clearly not our final goal: we are rewriting hot paths in assembly, so there is still huge headroom for performance. Currently only armv7 is supported; armv8 support will follow.
- **Mali GPU**
......
......@@ -26,9 +26,6 @@ limitations under the License. */
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#ifdef _OPENMP
#include <omp.h>
#endif // _OPENMP
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include <utility>
......@@ -407,14 +404,6 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
return result_vector;
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::SetThreadNum(int num) {
#ifdef _OPENMP
// omp_set_dynamic(0);
omp_set_num_threads(num);
#endif
}
template class Executor<CPU, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
......
......@@ -58,8 +58,6 @@ class Executor {
std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims);
void SetThreadNum(int num);
protected:
Executor() = default;
void InitMemory();
......
......@@ -16,6 +16,14 @@ limitations under the License. */
namespace paddle_mobile {
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
#ifdef _OPENMP
// omp_set_dynamic(0);
omp_set_num_threads(num);
#endif
};
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
int batch_size) {
......@@ -81,7 +89,9 @@ PaddleMobile<Dtype, P>::~PaddleMobile() {
}
template class PaddleMobile<CPU, Precision::FP32>;
template class PaddleMobile<FPGA, Precision::FP32>;
template class PaddleMobile<GPU_MALI, Precision::FP32>;
} // namespace paddle_mobile
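The new `PaddleMobile::SetThreadNum` simply forwards to `omp_set_num_threads`, which fixes the team size used by every OpenMP parallel region entered afterwards (including the `#pragma omp parallel for` added to the GEMM below). A minimal standalone sketch of that semantics, assuming a compiler invoked with `-fopenmp`; the demo program itself is not part of paddle-mobile:

```cpp
#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

int main() {
#ifdef _OPENMP
  // The same call PaddleMobile::SetThreadNum(4) makes: all parallel
  // regions entered after this point run with a team of 4 threads.
  omp_set_num_threads(4);
#pragma omp parallel
  {
    std::printf("thread %d of %d\n", omp_get_thread_num(),
                omp_get_num_threads());
  }
#else
  std::printf("built without OpenMP; running single-threaded\n");
#endif
  return 0;
}
```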
......@@ -17,6 +17,9 @@ limitations under the License. */
#include <memory>
#include <string>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif // _OPENMP
#include "common/types.h"
#include "framework/tensor.h"
......@@ -44,6 +47,7 @@ class PaddleMobile {
* */
bool Load(const std::string &model_path, const std::string &para_path,
bool optimize = false, int batch_size = 1);
void SetThreadNum(int num);
/*
* @b to predict
......
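With this change the thread count is configured on the public `PaddleMobile` facade instead of on `Executor`. A short usage sketch mirroring the updated tests further down (the `g_mobilenet` model path, the `GetInput` helper, and the 4-thread setting are taken from that test code; the surrounding setup is assumed boilerplate):

```cpp
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);  // set before Predict so the OpenMP GEMM uses 4 threads
if (paddle_mobile.Load(g_mobilenet, /*optimize=*/true)) {
  std::vector<float> input;
  std::vector<int64_t> dims{1, 3, 224, 224};
  GetInput<float>(g_test_image_1x3x224x224, &input, dims);
  auto result = paddle_mobile.Predict(input, dims);
}
```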
......@@ -18,6 +18,9 @@ limitations under the License. */
#ifndef X86
#include <arm_neon.h>
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
namespace paddle_mobile {
namespace operators {
......@@ -158,6 +161,7 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
// Blocked (tiled) matrix multiplication
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
float beta, float *c, float *C, int ldc, bool relu) {
#pragma omp parallel for
for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) {
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
......@@ -187,6 +191,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias) {
#pragma omp parallel for
for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) {
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
......
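The added `#pragma omp parallel for` spreads the NR-wide column panels of the blocked multiplication across the OpenMP team; iterations are independent because each (i, j) tile writes a disjoint block of C. Below is a simplified sketch of the pattern, with illustrative tile sizes and a plain scalar micro-kernel standing in for paddle-mobile's NEON `AddDot` kernels (the names, packing layout, and no-tail assumption are this sketch's, not the library's):

```cpp
constexpr int MR = 4, NR = 4;  // illustrative register-tile sizes

// c[MR x NR] += a_panel * b_panel, where the packed panels store, for each
// step p, one MR-high column of A and one NR-wide row of B.
static void MicroKernel(int kc, const float *a, const float *b, float *c,
                        int ldc) {
  for (int p = 0; p < kc; ++p)
    for (int i = 0; i < MR; ++i)
      for (int j = 0; j < NR; ++j)
        c[i * ldc + j] += a[p * MR + i] * b[p * NR + j];
}

// mc and nc are assumed to be multiples of MR and NR for brevity (the real
// code handles the tails). Distinct j panels touch distinct columns of C,
// so the outer loop needs no synchronization.
void InnerKernelSketch(int mc, int nc, int kc, const float *a, const float *b,
                       float *c, int ldc) {
#pragma omp parallel for
  for (int j = 0; j < nc; j += NR)
    for (int i = 0; i < mc; i += MR)
      MicroKernel(kc, a + i * kc, b + j * kc, c + i * ldc + j, ldc);
}
```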
......@@ -17,26 +17,21 @@ limitations under the License. */
#include "../test_include.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
bool optimize = true;
auto time1 = time();
// auto program = loader.Load(g_googlenet, optimize);
auto program = loader.Load(g_googlenet_combine + "/model",
g_googlenet_combine + "/params", optimize);
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
executor.SetThreadNum(4);
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
auto time3 = time();
int count = 1;
for (int i = 0; i < count; ++i) {
executor.Predict(input, dims);
}
if (paddle_mobile.Load(g_googlenet, optimize)) {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
auto time3 = time();
auto vec_result = paddle_mobile.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) / count << "ms\n";
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
}
return 0;
}
......@@ -18,6 +18,7 @@ limitations under the License. */
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
if (paddle_mobile.Load(g_mobilenet, true)) {
auto time2 = time();
......