diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index e8964c4acea0d220deca048a018eb7de42d7e4e5..467d4411376381df950bb582f9c73410284a5e2d 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -183,6 +183,9 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) {
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                             std::vector<PaddleTensor> *output_data,
                             int batch_size) {
+  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
+    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+  }
   VLOG(3) << "Predictor::predict";
   inference::Timer timer;
   timer.tic();
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 97c164bdef7a4b3e66be78526793f3830ada398b..048286a843f0190a8139cb86eda4f3a3a40d89a1 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -131,6 +131,9 @@ NativePaddlePredictor::~NativePaddlePredictor() {
 bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                 std::vector<PaddleTensor> *output_data,
                                 int batch_size) {
+  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
+    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+  }
   VLOG(3) << "Predictor::predict";
   Timer timer;
   timer.tic();
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index c27c39f40a2067dd2bd2150e4b1e53eab7cdf06e..36282b3efe5756da55b056c09e94aa352e3dcf8a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -366,15 +366,17 @@ TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
 #define NEW_TENSOR(name__) \
   auto name__##_tensor = predictor->GetInputTensor(#name__);
 
-  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
   double total_time_of_threads{0};
   std::vector<std::thread> threads;
 
   for (int tid = 0; tid < FLAGS_num_threads; tid++) {
     threads.emplace_back([&, tid] {
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = base_predictor->Clone();
+      auto &predictor = predictors[tid];
       NEW_TENSOR(data_lod_attention);
       NEW_TENSOR(cell_init);
       NEW_TENSOR(data);
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
index bd0059e18485c046df27d5ddbb39df9bbb249113..cca2ab1ee148b568e714c24dded7cd72403f0e5f 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -266,15 +266,17 @@ TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
   SetConfig(&config);
   config.SwitchUseFeedFetchOps(false);
 
-  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
   double total_time_of_threads{0};
   std::vector<std::thread> threads;
 
   for (int tid = 0; tid < FLAGS_num_threads; tid++) {
     threads.emplace_back([&, tid] {
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = base_predictor->Clone();
+      auto &predictor = predictors[tid];
       std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
       PrepareZeroCopyInputs(predictor, &inputs);
       auto output_tensor = predictor->GetOutputTensor(out_var_name);
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 2811eb4946ea025cf6c7ab197c4e603df86f6f2d..2e53fddfe7f6f0c5b31ff069fb1661f143022841 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -17,8 +17,10 @@
 #include <gtest/gtest.h>
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <thread>  // NOLINT
+#include <utility>
 #include <vector>
 #ifdef WITH_GPERFTOOLS
 #include <gperftools/profiler.h>
 #endif
@@ -252,7 +254,11 @@ void TestMultiThreadPrediction(
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   std::vector<std::thread> threads;
-  auto main_predictor = CreateTestPredictor(config, use_analysis);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
+  for (int tid = 1; tid < num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
   size_t total_time{0};
 
   for (int tid = 0; tid < num_threads; ++tid) {
@@ -260,9 +266,7 @@
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector<PaddleTensor> outputs_tid;
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = main_predictor->Clone();
+      auto &predictor = predictors[tid];
 #ifdef PADDLE_WITH_MKLDNN
       if (use_analysis) {
         static_cast<AnalysisPredictor *>(predictor.get())
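The patch replaces per-thread Clone() calls inside the worker lambdas with a pool of predictors cloned once on the main thread, which each worker then indexes by its thread id; Run() additionally re-applies the configured math-library thread count whenever cpu_math_library_num_threads() > 1. For reference, a minimal standalone sketch of the same usage pattern against the public Paddle inference C++ API; the include path, model path, and worker count below are illustrative placeholders, not values taken from this patch:

#include <memory>
#include <thread>
#include <vector>

#include "paddle_inference_api.h"  // actual path depends on the install layout

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model_dir");  // placeholder model directory
  config.SetCpuMathLibraryNumThreads(1);  // keep math-library threading flat

  const int num_threads = 4;  // placeholder worker count

  // Create one predictor, then clone the others sequentially on the main
  // thread, so Clone() never runs concurrently with another thread's Run().
  std::vector<std::unique_ptr<paddle::PaddlePredictor>> predictors;
  predictors.emplace_back(
      paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config));
  for (int tid = 1; tid < num_threads; ++tid) {
    predictors.emplace_back(predictors.front()->Clone());
  }

  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid] {
      auto &predictor = predictors[tid];  // each worker owns exactly one clone
      std::vector<paddle::PaddleTensor> inputs, outputs;
      // ... fill inputs with real tensors before running ...
      predictor->Run(inputs, &outputs);
    });
  }
  for (auto &t : threads) t.join();
  return 0;
}

Cloning up front means each worker starts with its predictor already constructed, and the lambdas do no predictor setup work in the concurrent phase.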