Merge pull request #12397 from tensor-tang/refine/num_threads

refine num_threads control

Merge pull request #12397 from tensor-tang/refine/num_threads
refine num_threads control
5bea9c14 · tensor-tang · GitHub · cca3bd6e · 4f0383f5 · 5bea9c14
4 changed file
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -20,9 +20,6 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#ifdef PADDLE_WITH_MKLML
-#include <omp.h>
-#endif
 DEFINE_string(model_path, "", "Directory of the inference model.");
 DEFINE_string(data_file, "", "File of input index data.");
@@ -30,6 +27,7 @@ DEFINE_int32(repeat, 100, "Running the inference program repeat times");
 DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
 DEFINE_int32(num_threads, 1, "Number of threads should be used");
 DECLARE_bool(use_mkldnn);
+DECLARE_int32(paddle_num_threads);
 inline double GetCurrentMs() {
  struct timeval time;
@@ -160,12 +158,7 @@ TEST(inference, nlp) {
  std::unique_ptr<paddle::framework::Scope> scope(
      new paddle::framework::Scope());
-#ifdef PADDLE_WITH_MKLML
+  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
-  // only use 1 thread number per std::thread
-  omp_set_dynamic(0);
-  omp_set_num_threads(1);
-  paddle::platform::SetNumThreads(1);
-#endif
  double start_ms = 0, stop_ms = 0;
  if (FLAGS_num_threads > 1) {

--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #ifdef PADDLE_WITH_MKLML
+#include <omp.h>
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif
@@ -33,6 +34,7 @@ void SetNumThreads(int num_threads) {
 #elif defined(PADDLE_WITH_MKLML)
  int real_num_threads = num_threads > 1 ? num_threads : 1;
  platform::dynload::MKL_Set_Num_Threads(real_num_threads);
+  omp_set_num_threads(num_threads);
 #else
  PADDLE_ENFORCE(false, "To be implemented.");
 #endif

--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -23,6 +23,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
+DEFINE_int32(paddle_num_threads, 1,
+             "Number of threads for each paddle instance.");
 namespace paddle {
 namespace framework {
@@ -115,7 +118,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
  places.emplace_back(platform::CPUPlace());
  platform::DeviceContextPool::Init(places);
 #ifndef PADDLE_WITH_MKLDNN
-  platform::SetNumThreads(1);
+  platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif
 }

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -123,7 +123,7 @@ def __bootstrap__():
    read_env_flags = [
        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'free_idle_memory'
+        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads'
    ]
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')