[cherry-pick to release/1.5] slim threading fix (#18119)

* fix multithreading issue test=develop * review fixes test=develop * review fix: omp->cpu, inference_api.cc->pybind.cc test=release/1.5

[cherry-pick to release/1.5] slim threading fix (#18119)
* fix multithreading issue test=develop * review fixes test=develop * review fix: omp->cpu, inference_api.cc->pybind.cc test=release/1.5
51bac347 · Sylwester Fraczek · Tao Luo · 31ef8c1c · 51bac347 · 51bac347
3 changed files
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -44,6 +44,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
@@ -164,6 +165,8 @@ PYBIND11_MODULE(core_noavx, m) {
  BindException(&m);
+  m.def("set_num_threads", &platform::SetNumThreads);
  m.def(
      "_append_python_callable_object_and_return_id",
      [](py::object py_obj) -> size_t {
@@ -283,8 +286,8 @@ PYBIND11_MODULE(core_noavx, m) {
    LoD is short for Level of Details and is usually used for varied sequence
    length. You can skip the following comment if you don't need optional LoD.
-    For example, a LoDTensor X can look like the example below. It contains 
+    For example, a LoDTensor X can look like the example below. It contains
-    2 sequences. The first has length 2 and the second has length 3, as 
+    2 sequences. The first has length 2 and the second has length 3, as
    described by x.lod.
    The first tensor dimension 5=2+3 is calculated from LoD if it's available.
@@ -292,7 +295,7 @@ PYBIND11_MODULE(core_noavx, m) {
    columns, hence [5, 2].
    x.lod  = [[2, 3]]
    x.data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    x.shape = [5, 2]
@@ -1002,7 +1005,7 @@ All parameter, weight, gradient are variables in Paddle.
    Examples:
        .. code-block:: python
          import paddle.fluid as fluid
          arr = fluid.LoDTensorArray()
@@ -1482,14 +1485,14 @@ All parameter, weight, gradient are variables in Paddle.
          "memory_optimize",
          [](const BuildStrategy &self) { return self.memory_optimize_; },
          [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; },
-          R"DOC(The type is BOOL, memory opitimize aims to save total memory 
+          R"DOC(The type is BOOL, memory opitimize aims to save total memory
                consumption, set to True to enable it.
-                Memory Optimize is our experimental feature, some variables 
+                Memory Optimize is our experimental feature, some variables
                may be reused/removed by optimize strategy. If you need to
                fetch some variable values when using this feature, please
                set the persistable property of the variables to True.
                Default False)DOC")
      .def_property(
          "is_distribution",

--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -3,7 +3,7 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 function(inference_analysis_python_api_int8_test target model_dir data_dir filename)
    py_test(${target} SRCS ${filename}
-        ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
+        ENVS CPU_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
        ARGS --infer_model ${model_dir}/model
             --infer_data ${data_dir}/data.bin
             --int8_model_save_path int8_models/${target}

--- a/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
@@ -84,8 +84,8 @@ class TestMKLDNNPostTrainingQuantStrategy(unittest.TestCase):
                while step < num:
                    fp.seek(imgs_offset + img_size * step)
                    img = fp.read(img_size)
-                    img = struct.unpack_from('{}f'.format(img_ch * img_w *
+                    img = struct.unpack_from(
-                                                          img_h), img)
+                        '{}f'.format(img_ch * img_w * img_h), img)
                    img = np.array(img)
                    img.shape = (img_ch, img_w, img_h)
                    fp.seek(labels_offset + label_size * step)
@@ -137,12 +137,14 @@ class TestMKLDNNPostTrainingQuantStrategy(unittest.TestCase):
                images = np.array(images).astype('float32')
                labels = np.array([x[1] for x in data]).astype("int64")
                labels = labels.reshape([-1, 1])
+                fluid.core.set_num_threads(int(os.environ['CPU_NUM_THREADS']))
                out = exe.run(inference_program,
                              feed={
                                  feed_target_names[0]: images,
                                  feed_target_names[1]: labels
                              },
                              fetch_list=fetch_targets)
+                fluid.core.set_num_threads(1)
                top1 += np.sum(out[1]) * len(data)
                top5 += np.sum(out[2]) * len(data)
                total_samples += len(data)
@@ -183,8 +185,8 @@ class TestMKLDNNPostTrainingQuantStrategy(unittest.TestCase):
        accuracy_diff_threshold = test_case_args.accuracy_diff_threshold
        _logger.info(
-            'FP32 & INT8 prediction run: batch_size {0}, warmup batch size {1}.'.
+            'FP32 & INT8 prediction run: batch_size {0}, warmup batch size {1}.'
-            format(batch_size, warmup_batch_size))
+            .format(batch_size, warmup_batch_size))
        #warmup dataset, only use the first batch data
        warmup_reader = paddle.batch(