From 51bac34750e996b68e78fc562fc2a71a8e1ea39b Mon Sep 17 00:00:00 2001
From: Sylwester Fraczek
Date: Sat, 15 Jun 2019 17:01:22 +0200
Subject: [PATCH] [cherry-pick to release/1.5] slim threading fix (#18119)

* fix multithreading issue test=develop

* review fixes test=develop

* review fix: omp->cpu, inference_api.cc->pybind.cc test=release/1.5
---
 paddle/fluid/pybind/pybind.cc                      | 19 +++++++++++--------
 .../fluid/contrib/slim/tests/CMakeLists.txt        |  2 +-
 .../test_mkldnn_int8_quantization_strategy.py      | 10 ++++++----
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 1f9c5a679b..b0030d010f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -44,6 +44,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
@@ -164,6 +165,8 @@ PYBIND11_MODULE(core_noavx, m) {

   BindException(&m);

+  m.def("set_num_threads", &platform::SetNumThreads);
+
   m.def(
       "_append_python_callable_object_and_return_id",
       [](py::object py_obj) -> size_t {
@@ -283,8 +286,8 @@ PYBIND11_MODULE(core_noavx, m) {
         LoD is short for Level of Details and is usually used for varied sequence
         length. You can skip the following comment if you don't need optional LoD.

-        For example, a LoDTensor X can look like the example below. It contains 
-        2 sequences. The first has length 2 and the second has length 3, as 
+        For example, a LoDTensor X can look like the example below. It contains
+        2 sequences. The first has length 2 and the second has length 3, as
         described by x.lod.

         The first tensor dimension 5=2+3 is calculated from LoD if it's
@@ -292,7 +295,7 @@ PYBIND11_MODULE(core_noavx, m) {
         columns, hence [5, 2].

         x.lod  = [[2, 3]]
-        
+
         x.data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]

         x.shape = [5, 2]
@@ -1002,7 +1005,7 @@ All parameter, weight, gradient are variables in Paddle.

         Examples:
             .. code-block:: python
-        
+
               import paddle.fluid as fluid

               arr = fluid.LoDTensorArray()
@@ -1482,14 +1485,14 @@ All parameter, weight, gradient are variables in Paddle.
           "memory_optimize",
           [](const BuildStrategy &self) { return self.memory_optimize_; },
           [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; },
-          R"DOC(The type is BOOL, memory opitimize aims to save total memory 
+          R"DOC(The type is BOOL, memory optimize aims to save total memory
           consumption, set to True to enable it.
-        
-          Memory Optimize is our experimental feature, some variables 
+
+          Memory Optimize is our experimental feature, some variables
           may be reused/removed by optimize strategy. If you need to
           fetch some variable values when using this feature, please
           set the persistable property of the variables to True.
- + Default False)DOC") .def_property( "is_distribution", diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 23607d5052..e61e93da3f 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -3,7 +3,7 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") function(inference_analysis_python_api_int8_test target model_dir data_dir filename) py_test(${target} SRCS ${filename} - ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI} + ENVS CPU_NUM_THREADS=${CPU_NUM_THREADS_ON_CI} ARGS --infer_model ${model_dir}/model --infer_data ${data_dir}/data.bin --int8_model_save_path int8_models/${target} diff --git a/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py b/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py index 01c8f893fa..f1ebb8ae72 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py +++ b/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py @@ -84,8 +84,8 @@ class TestMKLDNNPostTrainingQuantStrategy(unittest.TestCase): while step < num: fp.seek(imgs_offset + img_size * step) img = fp.read(img_size) - img = struct.unpack_from('{}f'.format(img_ch * img_w * - img_h), img) + img = struct.unpack_from( + '{}f'.format(img_ch * img_w * img_h), img) img = np.array(img) img.shape = (img_ch, img_w, img_h) fp.seek(labels_offset + label_size * step) @@ -137,12 +137,14 @@ class TestMKLDNNPostTrainingQuantStrategy(unittest.TestCase): images = np.array(images).astype('float32') labels = np.array([x[1] for x in data]).astype("int64") labels = labels.reshape([-1, 1]) + fluid.core.set_num_threads(int(os.environ['CPU_NUM_THREADS'])) out = exe.run(inference_program, feed={ feed_target_names[0]: images, feed_target_names[1]: labels }, fetch_list=fetch_targets) + fluid.core.set_num_threads(1) top1 += np.sum(out[1]) * len(data) top5 += np.sum(out[2]) * len(data) total_samples += len(data) @@ -183,8 +185,8 @@ class TestMKLDNNPostTrainingQuantStrategy(unittest.TestCase): accuracy_diff_threshold = test_case_args.accuracy_diff_threshold _logger.info( - 'FP32 & INT8 prediction run: batch_size {0}, warmup batch size {1}.'. - format(batch_size, warmup_batch_size)) + 'FP32 & INT8 prediction run: batch_size {0}, warmup batch size {1}.' + .format(batch_size, warmup_batch_size)) #warmup dataset, only use the first batch data warmup_reader = paddle.batch( -- GitLab