diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 5ccd2dc5ab353b1634b651a4b7caa2af0da75ce4..dd75f0d9a65404908667d873786160ddaa73fa57 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
   prog_file = other.prog_file;
   param_file = other.param_file;
   specify_input_name = other.specify_input_name;
+  cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
   // fields from this.
   enable_ir_optim = other.enable_ir_optim;
   use_feed_fetch_ops = other.use_feed_fetch_ops;
@@ -72,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
   prog_file = other.prog_file;
   param_file = other.param_file;
   specify_input_name = other.specify_input_name;
+  cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
   // fields from this.
   enable_ir_optim = other.enable_ir_optim;
   use_feed_fetch_ops = other.use_feed_fetch_ops;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index cb14d2a2602808bd35106ed2bafcf7975f549597..c132ce326c6b22ea235e6fb8c570678cb54e22ef 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -35,7 +35,6 @@
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(profile);
-DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 
@@ -67,7 +66,7 @@ bool AnalysisPredictor::Init(
 #endif
 
   // no matter with or without MKLDNN
-  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
+  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
 
   if (!PrepareScope(parent_scope)) {
     return false;
@@ -160,6 +159,14 @@ bool AnalysisPredictor::PrepareExecutor() {
   return true;
 }
 
+void AnalysisPredictor::SetMkldnnThreadID(int tid) {
+#ifdef PADDLE_WITH_MKLDNN
+  platform::set_cur_thread_id(tid);
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
+#endif
+}
+
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                             std::vector<PaddleTensor> *output_data,
                             int batch_size) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index cf81b7db738d899566ddf32c5e5a40475c8e7bc7..db57812bc3ba8e6d578e665524cb5749e6bfecd6 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor {
   framework::Scope *scope() { return scope_.get(); }
   framework::ProgramDesc &program() { return *inference_program_; }
 
+  void SetMkldnnThreadID(int tid);
+
  protected:
   bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
   bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index fcbc3803d04def9a9855f2fee489e7e2c561b454..66a8e513961d74b96a98b01393048112ded65482 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -28,7 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(profile, false, "Turn on profiler for fluid");
-DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 namespace {
@@ -76,7 +75,7 @@ bool NativePaddlePredictor::Init(
 #endif
 
   // no matter with or without MKLDNN
-  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
+  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
 
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2ac736df7ccd54babe582ca1383903c191069d33..a09bd1cac2aa31b2ecee9b0f77d2b777104f1161 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig {
                             int max_batch_size = 1);
   bool use_tensorrt() const { return use_tensorrt_; }
 
+  void EnableMKLDNN();
   // NOTE this is just for internal development, please not use it.
   // NOT stable yet.
-  void EnableMKLDNN();
   bool use_mkldnn() const { return use_mkldnn_; }
 
   friend class ::paddle::AnalysisPredictor;
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 0a2a2a1a23401b5aa4d3402da6f7a3369280d8f5..1513a4b3b4f6620aa8bf53e082afa203af15ca13 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -186,6 +186,19 @@ struct NativeConfig : public PaddlePredictor::Config {
   // Specify the variable's name of each input if input tensors don't follow the
   // `feeds` and `fetches` of the phase `save_inference_model`.
   bool specify_input_name{false};
+
+  // Set and get the number of cpu math library threads.
+  void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
+    cpu_math_library_num_threads_ = cpu_math_library_num_threads;
+  }
+  int cpu_math_library_num_threads() const {
+    return cpu_math_library_num_threads_;
+  }
+
+ protected:
+  // number of cpu math library (such as MKL, OpenBlas) threads for each
+  // instance.
+  int cpu_math_library_num_threads_{1};
 };
 
 // A factory to help create different predictors.
diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index 2b936175ed3f8ec24826485027048c82df0461ab..abc63577b7913a3c9de7d6c16d8ac3e85ffd7c3c 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->device = 0;
   cfg->enable_ir_optim = true;
   cfg->specify_input_name = true;
+  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h
index aa0c4b1d049bc276cda2f58ac1edd8102fb3fd88..4231eef7220735d0b80eb1adc951c55ff7378f1b 100644
--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
   os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
   os << GenSpaces(num_spaces)
      << "specify_input_name: " << config.specify_input_name << "\n";
+  os << GenSpaces(num_spaces)
+     << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n";
   num_spaces--;
   os << GenSpaces(num_spaces) << "}\n";
   return os;
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 7b686045a59c93a93322f99c2cdf7050ddbf0a6d..1dc16784067836f173c9c0e0bc30f10c98435ed7 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true,
             "Running the inference program in analysis mode.");
 
 DECLARE_bool(profile);
+DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 namespace inference {
@@ -206,22 +207,23 @@ void TestMultiThreadPrediction(
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   std::vector<std::thread> threads;
-  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
-  for (int tid = 1; tid < num_threads; ++tid) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
+  auto main_predictor = CreateTestPredictor(config, use_analysis);
 
   size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
-#ifdef PADDLE_WITH_MKLDNN
-      platform::set_cur_thread_id(static_cast<int>(tid) + 1);
-#endif
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector<PaddleTensor> outputs_tid;
-      auto &predictor = predictors[tid];
+      // To ensure the thread binding correctly,
+      // please clone inside the threadpool.
+      auto predictor = main_predictor->Clone();
+#ifdef PADDLE_WITH_MKLDNN
+      if (use_analysis) {
+        static_cast<AnalysisPredictor *>(predictor.get())
+            ->SetMkldnnThreadID(static_cast<int>(tid) + 1);
+      }
+#endif
 
       // warmup run
       LOG(INFO) << "Running thread " << tid << ", warm up run...";
diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h
index b072b4c20a171d148bd892c162436d03da404fb9..5b9953a5aa9a29bd917d16a16c678fc32a32c18f 100644
--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
@@ -17,8 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/jit_kernel.h"
 
-DECLARE_int32(paddle_num_threads);
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -43,7 +41,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
                            .template Get<jitkernel::VAddKernel<T>>(N);
 
 #ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
+#pragma omp parallel for
 #endif
     for (int i = 0; i < M; i++) {
       T* dst = Y + i * N;
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index f2d691b2931f5a57e70fd4762e9dea5665ed75c2..b737a6c38d0441cbfcd55ed2c55969ceca68db5d 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) {
 #elif defined(PADDLE_WITH_MKLML)
   int real_num_threads = num_threads > 1 ? num_threads : 1;
   platform::dynload::MKL_Set_Num_Threads(real_num_threads);
-  omp_set_num_threads(num_threads);
+  omp_set_num_threads(real_num_threads);
 #else
   PADDLE_ENFORCE(false, "To be implemented.");
 #endif