diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
index 9ab808efec3abdb86724fb16725962958c5cf55c..e8224be2d495dafba46ce4bbb9537e8dcc993a8c 100644
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
+#include "paddle/fluid/inference/tests/test_multi_thread_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 
@@ -40,6 +41,7 @@ TEST(inference, fit_a_line) {
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
+  LOG(INFO) << "--- CPU Runs: ---";
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
   LOG(INFO) << output1.dims();
 
@@ -49,9 +51,73 @@ TEST(inference, fit_a_line) {
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
+  LOG(INFO) << "--- GPU Runs: ---";
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
   LOG(INFO) << output2.dims();
 
   CheckError<float>(output1, output2);
 #endif
 }
+
+TEST(multi_thread_inference, fit_a_line) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` to initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  int num_threads = 2;
+
+  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_feeds;
+  cpu_feeds.resize(num_threads);
+  for (int i = 0; i < num_threads; ++i) {
+    auto* input = new paddle::framework::LoDTensor();
+    // The second dim of the input tensor should be 13
+    // The input data should be >= 0
+    int64_t batch_size = 10;
+    SetupTensor<float>(*input,
+                       {batch_size, 13},
+                       static_cast<float>(0),
+                       static_cast<float>(10));
+    cpu_feeds[i].push_back(input);
+  }
+
+  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
+  cpu_fetchs1.resize(num_threads);
+  for (int i = 0; i < num_threads; ++i) {
+    auto* output = new paddle::framework::LoDTensor();
+    cpu_fetchs1[i].push_back(output);
+  }
+
+  // Run inference on CPU
+  LOG(INFO) << "--- CPU Runs (Multi Thread): ---";
+  TestMultiThreadInference<paddle::platform::CPUPlace>(
+      dirname, cpu_feeds, cpu_fetchs1, num_threads);
+
+#ifdef PADDLE_WITH_CUDA
+  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
+  cpu_fetchs2.resize(num_threads);
+  for (int i = 0; i < num_threads; ++i) {
+    auto* output = new paddle::framework::LoDTensor();
+    cpu_fetchs2[i].push_back(output);
+  }
+
+  // Run inference on CUDA GPU
+  LOG(INFO) << "--- GPU Runs (Multi Thread): ---";
+  TestMultiThreadInference<paddle::platform::CUDAPlace>(
+      dirname, cpu_feeds, cpu_fetchs2, num_threads);
+
+  for (int i = 0; i < num_threads; ++i) {
+    delete cpu_fetchs2[i][0];
+  }
+#endif
+
+  for (int i = 0; i < num_threads; ++i) {
+    delete cpu_feeds[i][0];
+    delete cpu_fetchs1[i][0];
+  }
+}
diff --git a/paddle/fluid/inference/tests/test_multi_thread_helper.h b/paddle/fluid/inference/tests/test_multi_thread_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..54e203833bae19e68ad0fc992906d49d5acd0141
--- /dev/null
+++ b/paddle/fluid/inference/tests/test_multi_thread_helper.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thread>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/io.h"
+
+void ThreadedRunInference(
+    std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
+    paddle::framework::Executor& executor,
+    paddle::framework::Scope* scope,
+    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
+    std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+  // 3. Get the feed_target_names and fetch_target_names
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      inference_program->GetFetchTargetNames();
+
+  // 4. Prepare inputs: set up maps for feed targets
+  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+  for (size_t i = 0; i < feed_target_names.size(); ++i) {
+    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+    feed_targets[feed_target_names[i]] = cpu_feeds[i];
+  }
+
+  // 5. Define Tensor to get the outputs: set up maps for fetch targets
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+  }
+
+  // 6. Run the inference program
+  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+}
+
+template <typename Place>
+void TestMultiThreadInference(
+    const std::string& dirname,
+    const std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_feeds,
+    std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_fetchs,
+    const int num_threads) {
+  // 1. Define place, executor, scope
+  auto place = Place();
+  auto executor = paddle::framework::Executor(place);
+  auto* scope = new paddle::framework::Scope();
+
+  // 2. Initialize the inference_program and load parameters
+  std::unique_ptr<paddle::framework::ProgramDesc> inference_program =
+      paddle::inference::Load(executor, *scope, dirname);
+
+  std::vector<std::thread*> threads;
+  for (int i = 0; i < num_threads; ++i) {
+    threads.push_back(new std::thread(ThreadedRunInference,
+                                      std::ref(inference_program),
+                                      std::ref(executor),
+                                      scope,
+                                      std::ref(cpu_feeds[i]),
+                                      std::ref(cpu_fetchs[i])));
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i]->join();
+    delete threads[i];
+  }
+
+  delete scope;
+}
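The pattern the new helper introduces is: load the program and parameters once, share the `Executor`, `Scope`, and `ProgramDesc` read-only across workers, give each thread its own feed/fetch tensor vectors, then spawn and join. The standalone sketch below isolates just that fan-out/join mechanic; `FakeRun` and every other name in it are hypothetical stand-ins (not Paddle API), so the example compiles with nothing but the standard library.

```cpp
// Minimal sketch of the fan-out/join pattern used by TestMultiThreadInference.
// FakeRun is a hypothetical stand-in for executor.Run(): the shared state is
// read-only once loaded, and each thread owns its own feed/fetch slots, so no
// locking is needed.
#include <iostream>
#include <string>
#include <thread>
#include <vector>

void FakeRun(const std::string& shared_model,   // shared, read-only
             const std::vector<float>& feeds,   // per-thread input
             std::vector<float>& fetchs) {      // per-thread output
  for (float v : feeds) fetchs.push_back(v * 2.0f);  // pretend inference
}

int main() {
  const std::string model = "fit_a_line";  // stands in for the loaded program
  const int num_threads = 2;

  std::vector<std::vector<float>> feeds(num_threads, {1.0f, 2.0f, 3.0f});
  std::vector<std::vector<float>> fetchs(num_threads);

  // Spawn one worker per thread; std::ref/std::cref forward references
  // through std::thread's decay-copying constructor, as in the helper.
  std::vector<std::thread> threads;
  for (int i = 0; i < num_threads; ++i) {
    threads.emplace_back(FakeRun, std::cref(model), std::cref(feeds[i]),
                         std::ref(fetchs[i]));
  }
  for (auto& t : threads) t.join();  // join before touching the results

  for (int i = 0; i < num_threads; ++i) {
    std::cout << "thread " << i << " fetched " << fetchs[i].size()
              << " values\n";
  }
  return 0;
}
```

Unlike the diff, the sketch stores `std::thread` objects by value and lets `emplace_back`/`join` manage them rather than pairing `new std::thread` with `delete`; both forms work, but the by-value form cannot leak a joinable thread on an early return.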
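Similarly, the new test owns its `LoDTensor`s through raw `new` plus explicit `delete` loops. A hypothetical variant using `std::unique_ptr` (again with a stand-in `LoDTensor` type, not Paddle's) keeps the same `std::vector<LoDTensor*>` shape the helper expects while making the cleanup loops unnecessary:

```cpp
#include <memory>
#include <vector>

struct LoDTensor {};  // stand-in for paddle::framework::LoDTensor

int main() {
  const int num_threads = 2;

  // owners keeps every tensor alive for the whole test; feeds exposes the
  // raw pointers that a TestMultiThreadInference-style interface expects.
  std::vector<std::unique_ptr<LoDTensor>> owners;
  std::vector<std::vector<LoDTensor*>> feeds(num_threads);
  for (int i = 0; i < num_threads; ++i) {
    owners.push_back(std::unique_ptr<LoDTensor>(new LoDTensor()));
    feeds[i].push_back(owners.back().get());
  }

  // ... run inference with feeds ...

  return 0;
}  // owners releases every tensor here; no delete loops needed
```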