diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index fe29a4ae58e8df1a5b052f678c111964d60c9ac3..4288081be72c44c0fc3584b50c41a270eac9e204 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -53,10 +53,24 @@ class ProgramDesc {
 
   proto::ProgramDesc *Proto();
 
+  // The output variable of feed_op is referenced as feed_target.
+  // This function is used to collect the output variables' names of all
+  // feed_ops.
   const std::vector<std::string> GetFeedTargetNames();
+
+  // The input variable of fetch_op is referenced as fetch_target.
+  // This function is used to collect the input variables' names of all
+  // fetch_ops.
   const std::vector<std::string> GetFetchTargetNames();
 
+  // The input variable of feed_op, which holds the input Tensor provided by
+  // users, is referenced as feed_holder.
+  // This function is used to change or unify the feed_holder variables' name.
   void SetFeedHolderName(const std::string &feed_holder_name);
+
+  // The output variable of fetch_op, which holds the output Tensor needed by
+  // users, is referenced as fetch_holder.
+  // This function is used to change or unify the fetch_holder variables' name.
   void SetFetchHolderName(const std::string &fetch_holder_name);
 
  private:
diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
index 7ad7278706b8add70afea23c713e3f37aceabeae..2c5b66a32903f4ffdedb074b31aec53ae6cacaf3 100644
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
@@ -27,96 +27,63 @@ TEST(inference, fit_a_line) {
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
-  paddle::framework::LoDTensor input;
-  // The second dim of the input tensor should be 13
-  // The input data should be >= 0
-  int64_t batch_size = 10;
-  SetupTensor<float>(&input, {batch_size, 13}, static_cast<float>(0),
-                     static_cast<float>(10));
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&input);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
-  LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
+  for (int num_threads : {1, 2}) {
+    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_feeds;
+    cpu_feeds.resize(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      auto* input = new paddle::framework::LoDTensor();
+      // The second dim of the input tensor should be 13
+      // The input data should be >= 0
+      int64_t batch_size = 10;
+      SetupTensor<float>(input, {batch_size, 13}, static_cast<float>(0),
+                         static_cast<float>(10));
+      cpu_feeds[i].push_back(input);
+    }
+
+    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
+    cpu_fetchs1.resize(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      auto* output = new paddle::framework::LoDTensor();
+      cpu_fetchs1[i].push_back(output);
+    }
+
+    // Run inference on CPU
+    LOG(INFO) << "--- CPU Runs (num_threads: " << num_threads << "): ---";
+    if (num_threads == 1) {
+      TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds[0],
+                                                cpu_fetchs1[0]);
+    } else {
+      TestMultiThreadInference<paddle::platform::CPUPlace>(
+          dirname, cpu_feeds, cpu_fetchs1, num_threads);
+    }
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  LOG(INFO) << "--- GPU Runs: ---";
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
-
-  CheckError<float>(output1, output2);
+    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
+    cpu_fetchs2.resize(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      auto* output = new paddle::framework::LoDTensor();
+      cpu_fetchs2[i].push_back(output);
+    }
+
+    // Run inference on CUDA GPU
+    LOG(INFO) << "--- GPU Runs (num_threads: " << num_threads << "): ---";
+    if (num_threads == 1) {
+      TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds[0],
+                                                 cpu_fetchs2[0]);
+    } else {
+      TestMultiThreadInference<paddle::platform::CUDAPlace>(
+          dirname, cpu_feeds, cpu_fetchs2, num_threads);
+    }
+
+    for (int i = 0; i < num_threads; ++i) {
+      CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
+      delete cpu_fetchs2[i][0];
+    }
 #endif
-}
-
-TEST(multi_thread_inference, fit_a_line) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-  int num_threads = 2;
-
-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_feeds;
-  cpu_feeds.resize(num_threads);
-  for (int i = 0; i < num_threads; ++i) {
-    auto* input = new paddle::framework::LoDTensor();
-    // The second dim of the input tensor should be 13
-    // The input data should be >= 0
-    int64_t batch_size = 10;
-    SetupTensor<float>(input, {batch_size, 13}, static_cast<float>(0),
-                       static_cast<float>(10));
-    cpu_feeds[i].push_back(input);
-  }
-
-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
-  cpu_fetchs1.resize(num_threads);
-  for (int i = 0; i < num_threads; ++i) {
-    auto* output = new paddle::framework::LoDTensor();
-    cpu_fetchs1[i].push_back(output);
-  }
-
-  // Run inference on CPU
-  LOG(INFO) << "--- CPU Runs (Multi Thread): ---";
-  TestMultiThreadInference<paddle::platform::CPUPlace>(
-      dirname, cpu_feeds, cpu_fetchs1, num_threads);
-
-#ifdef PADDLE_WITH_CUDA
-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
-  cpu_fetchs2.resize(num_threads);
-  for (int i = 0; i < num_threads; ++i) {
-    auto* output = new paddle::framework::LoDTensor();
-    cpu_fetchs2[i].push_back(output);
-  }
-
-  // Run inference on CUDA GPU
-  LOG(INFO) << "--- GPU Runs (Multi Thread): ---";
-  TestMultiThreadInference<paddle::platform::CUDAPlace>(
-      dirname, cpu_feeds, cpu_fetchs2, num_threads);
-
-  for (int i = 0; i < num_threads; ++i) {
-    CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
-    delete cpu_fetchs2[i][0];
-  }
-#endif
-
-  for (int i = 0; i < num_threads; ++i) {
-    delete cpu_feeds[i][0];
-    delete cpu_fetchs1[i][0];
-  }
+
+    for (int i = 0; i < num_threads; ++i) {
+      delete cpu_feeds[i][0];
+      delete cpu_fetchs1[i][0];
+    }
+  }  // num_threads-loop
 }
diff --git a/paddle/fluid/inference/tests/test_multi_thread_helper.h b/paddle/fluid/inference/tests/test_multi_thread_helper.h
index 405e9edb4a598f1af46085e1ef654dc5c2d7506b..56745f115db231d4350da72b7de7967175ac9fe8 100644
--- a/paddle/fluid/inference/tests/test_multi_thread_helper.h
+++ b/paddle/fluid/inference/tests/test_multi_thread_helper.h
@@ -56,7 +56,7 @@ void ThreadedRunInference(
   }
 
   // 6. Run the inference program
-  executor->Run(*copy_program, scope, feed_targets, fetch_targets,
+  executor->Run(*copy_program, scope, feed_targets, fetch_targets, true,
                 feed_holder_name, fetch_holder_name);
 }
 
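Note: the holder renaming added in program_desc.h is what lets several threads run the same inference program in one scope without clobbering each other's feed/fetch variables. The sketch below only mirrors calls that appear in the hunks above; the helper name PrepareThreadProgram and the reading of the extra `true` argument to Executor::Run as "create variables" are assumptions for illustration, not part of this patch.

// Hypothetical per-thread setup; PrepareThreadProgram is not in this patch.
#include <memory>
#include <string>

#include "paddle/fluid/framework/program_desc.h"

std::unique_ptr<paddle::framework::ProgramDesc> PrepareThreadProgram(
    const paddle::framework::ProgramDesc& program, int thread_id) {
  // Copy the shared program so each thread can rename its own holders.
  std::unique_ptr<paddle::framework::ProgramDesc> copy_program(
      new paddle::framework::ProgramDesc(program));

  // Thread-unique names keep concurrent runs in the same scope from
  // colliding on the default "feed"/"fetch" holder variables.
  copy_program->SetFeedHolderName("feed_" + std::to_string(thread_id));
  copy_program->SetFetchHolderName("fetch_" + std::to_string(thread_id));
  return copy_program;
}

Each thread then collects its target names via GetFeedTargetNames() / GetFetchTargetNames() and passes the renamed holders to Executor::Run, as ThreadedRunInference does in the last hunk.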