accelerate with openmp

9367ccbd · wangliu · 3419d2fb · 9367ccbd · 9367ccbd · 9367ccbd
8 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.0)
 project(paddle-mobile)

 option(DEBUGING "enable debug mode" ON)
-option(USE_OPENMP "openmp support" OFF)
+option(USE_OPENMP "openmp support" ON)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build

--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "io/executor.h"
+#include <operators/math/gemm.h>
 #include <algorithm>
 #include <vector>
 #include "common/enforce.h"
@@ -25,6 +26,9 @@ limitations under the License. */
 #include "framework/program/var_desc.h"
 #include "framework/scope.h"
 #include "framework/tensor.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif  // _OPENMP
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
 #include <queue>
 #include <utility>
@@ -403,6 +407,14 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
  return result_vector;
 }

+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::SetThreadNum(int num) {  // no-op unless _OPENMP is defined
+#ifdef _OPENMP
+  //  omp_set_dynamic(0);  // NOTE(review): left disabled; confirm dynamic thread adjustment is acceptable
+  omp_set_num_threads(num);  // num is forwarded unchecked; OpenMP expects a positive value
+#endif
+}
+
 template class Executor<CPU, Precision::FP32>;
 template class Executor<FPGA, Precision::FP32>;
 template class Executor<GPU_MALI, Precision::FP32>;

--- a/src/io/executor.h
+++ b/src/io/executor.h
@@ -58,6 +58,8 @@ class Executor {
  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
                             const std::vector<int64_t> &dims);

+  void SetThreadNum(int num);
+
 protected:
  Executor() = default;
  void InitMemory();

--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #ifdef LRN_OP
-
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 #include "framework/operator.h"
 #include "operators/op_param.h"

@@ -47,6 +49,7 @@ struct LRNFunctor {
    std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0);

    for (int a = 0; a < N; a++) {
+#pragma parallel for
      for (int b = 0; b < C; b++) {
        for (int index = start; index < end; index++) {
          int channel = b + index;

--- a/src/operators/math/pool_3x3.cpp
+++ b/src/operators/math/pool_3x3.cpp
--- a/src/operators/math/pool_3x3.h
+++ b/src/operators/math/pool_3x3.h
@@ -15,10 +15,13 @@ limitations under the License. */
 #ifdef POOL_OP

 #pragma once
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 #include <algorithm>
 #include <vector>
 #include "framework/tensor.h"
-#ifdef __ARM_NEON
+#if __ARM_NEON
 #include <arm_neon.h>
 #endif  // __ARM_NEON


--- a/src/operators/math/pooling.cpp
+++ b/src/operators/math/pooling.cpp
@@ -14,10 +14,11 @@ limitations under the License. */

 #ifdef POOL_OP

-#include "operators/math/pooling.h"
-#include <algorithm>
-#include <vector>
+#include "pooling.h"
 #include "common/types.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif

 namespace paddle_mobile {
 namespace operators {
@@ -59,7 +60,7 @@ class PoolFunctor<CPU, PoolProcess, T> {
    T *output_data = output->mutable_data<T>();

    for (int i = 0; i < batch_size; i++) {
-      //  #pragma omp parallel for
+#pragma omp parallel for
      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;

--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -26,16 +26,17 @@ int main() {
  auto time2 = time();
  DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
+  executor.SetThreadNum(4);
  std::vector<float> input;
  std::vector<int64_t> dims{1, 3, 224, 224};
  GetInput<float>(g_test_image_1x3x224x224, &input, dims);
  auto time3 = time();
-
-  for (int i = 0; i < 10; ++i) {
+  int count = 1;
+  for (int i = 0; i < count; ++i) {
    executor.Predict(input, dims);
  }

  auto time4 = time();
-  DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
+  DLOG << "predict cost :" << time_diff(time3, time4) / count << "ms\n";
  return 0;
 }