diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index c942b43f174895d1bfa9688bb6d651f440b9bf41..ca02e38ede7a4b4be2ebe2602fbb720b79aaa741 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <time.h>
 #include <cstring>
+#include <thread>  // NOLINT
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
@@ -31,76 +32,78 @@ TEST(inference, understand_sentiment) {
   LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
   std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-  paddle::framework::LoDTensor words;
-  /*
-  paddle::framework::LoD lod{{0, 83}};
-  int64_t word_dict_len = 198392;
-  SetupLoDTensor(&words, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(word_dict_len - 1));
-  */
-  std::vector<int64_t> srcdata{
-      784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, 1550,
-      198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, 396, 6463,
-      6463, 1550, 198391, 6463, 22564, 1612, 291, 68, 164, 784,
-      784, 1550, 198391, 6463, 13659, 3362, 42468, 6189, 2209, 198391,
-      6463, 2209, 2209, 198391, 6463, 2209, 1062, 3029, 1831, 3029,
-      1065, 2281, 100, 11216, 1110, 56, 10869, 9811, 100, 198391,
-      6463, 100, 9280, 100, 288, 40031, 1680, 1335, 100, 1550,
-      9280, 7265, 244, 1550, 198391, 6463, 1550, 198391, 6463, 42468,
-      4376, 10251, 10760};
-  paddle::framework::LoD lod{{0, srcdata.size()}};
-  words.set_lod(lod);
-  int64_t* pdata = words.mutable_data<int64_t>(
-      {static_cast<int64_t>(srcdata.size()), 1}, paddle::platform::CPUPlace());
-  memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t));
-
-  LOG(INFO) << "number of input size:" << words.numel();
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&words);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
   const bool model_combined = false;
-  if (FLAGS_prepare_vars) {
-    if (FLAGS_prepare_context) {
-      TestInference<paddle::platform::CPUPlace, false, true>(
-          dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined,
-          FLAGS_use_mkldnn);
-    } else {
-      TestInference<paddle::platform::CPUPlace, false, false>(
-          dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined,
-          FLAGS_use_mkldnn);
-    }
-  } else {
-    if (FLAGS_prepare_context) {
-      TestInference<paddle::platform::CPUPlace, true, true>(
-          dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined,
-          FLAGS_use_mkldnn);
-    } else {
-      TestInference<paddle::platform::CPUPlace, true, false>(
-          dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined,
-          FLAGS_use_mkldnn);
-    }
-  }
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
-
-  CheckError(output1, output2);
-#endif
-
+  int total_work = 100;
+  int num_threads = 10;
+  int work_per_thread = total_work / num_threads;
+  std::vector<std::unique_ptr<std::thread>> infer_threads;
+  for (int i = 0; i < num_threads; ++i) {
+    infer_threads.emplace_back(new std::thread([&, i]() {
+      for (int j = 0; j < work_per_thread; ++j) {
+        // 0. Call `paddle::framework::InitDevices()` to initialize devices.
+        // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+        paddle::framework::LoDTensor words;
+        /*
+        paddle::framework::LoD lod{{0, 83}};
+        int64_t word_dict_len = 198392;
+        SetupLoDTensor(&words, lod, static_cast<int64_t>(0),
+                       static_cast<int64_t>(word_dict_len - 1));
+        */
+        std::vector<int64_t> srcdata{
+            784, 784, 1550, 6463, 56, 75693, 6189, 784, 784,
+            1550, 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297,
+            396, 6463, 6463, 1550, 198391, 6463, 22564, 1612, 291,
+            68, 164, 784, 784, 1550, 198391, 6463, 13659, 3362,
+            42468, 6189, 2209, 198391, 6463, 2209, 2209, 198391, 6463,
+            2209, 1062, 3029, 1831, 3029, 1065, 2281, 100, 11216,
+            1110, 56, 10869, 9811, 100, 198391, 6463, 100, 9280,
+            100, 288, 40031, 1680, 1335, 100, 1550, 9280, 7265,
+            244, 1550, 198391, 6463, 1550, 198391, 6463, 42468, 4376,
+            10251, 10760};
+        paddle::framework::LoD lod{{0, srcdata.size()}};
+        words.set_lod(lod);
+        int64_t* pdata = words.mutable_data<int64_t>(
+            {static_cast<int64_t>(srcdata.size()), 1},
+            paddle::platform::CPUPlace());
+        memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t));
+
+        LOG(INFO) << "number of input words: " << words.numel();
+        std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+        cpu_feeds.push_back(&words);
+
+        paddle::framework::LoDTensor output1;
+        std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+        cpu_fetchs1.push_back(&output1);
+
+        // Run inference on CPU
+        if (FLAGS_prepare_vars) {
+          if (FLAGS_prepare_context) {
+            TestInference<paddle::platform::CPUPlace, false, true>(
+                dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined,
+                FLAGS_use_mkldnn);
+          } else {
+            TestInference<paddle::platform::CPUPlace, false, false>(
+                dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined,
+                FLAGS_use_mkldnn);
+          }
+        } else {
+          if (FLAGS_prepare_context) {
+            TestInference<paddle::platform::CPUPlace, true, true>(
+                dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined,
+                FLAGS_use_mkldnn);
+          } else {
+            TestInference<paddle::platform::CPUPlace, true, false>(
+                dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined,
+                FLAGS_use_mkldnn);
+          }
+        }
+        LOG(INFO) << output1.lod();
+        LOG(INFO) << output1.dims();
+      }
+    }));
+  }
+  // Every std::thread must be joined before it is destroyed; without this
+  // loop the vector's destruction would call std::terminate().
+  for (int i = 0; i < num_threads; ++i) {
+    infer_threads[i]->join();
+  }
 }
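
For reviewers unfamiliar with the fan-out structure this change introduces, here is a minimal standalone sketch of the same pattern, stripped of the Paddle-specific setup. `RunOneInference` is a hypothetical stand-in for the per-iteration body of the test, not a Paddle API:

```cpp
#include <memory>
#include <thread>
#include <vector>

// Hypothetical placeholder for one inference pass (the body of the
// inner loop in the test above).
void RunOneInference(int thread_id, int iteration) { /* ... */ }

int main() {
  const int total_work = 100;
  const int num_threads = 10;
  // Integer division: work is dropped if total_work is not a multiple of
  // num_threads (100 / 10 = 10, so here nothing is lost).
  const int work_per_thread = total_work / num_threads;

  std::vector<std::unique_ptr<std::thread>> threads;
  for (int i = 0; i < num_threads; ++i) {
    threads.emplace_back(new std::thread([=]() {
      for (int j = 0; j < work_per_thread; ++j) {
        RunOneInference(i, j);
      }
    }));
  }
  // Join before the vector (and the std::thread objects it owns) is
  // destroyed; destroying a joinable std::thread calls std::terminate().
  for (auto& t : threads) {
    t->join();
  }
  return 0;
}
```

Capturing the loop index by value (`[&, i]` in the diff, `[=]` here) matters: capturing `i` by reference would race with the loop that keeps incrementing it.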
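One more note on the input setup: the `LoD` ("level of detail") built as `{{0, srcdata.size()}}` is Paddle's descriptor for variable-length sequences packed into one flat tensor. It is a list of cumulative offsets, so `{0, 83}` marks the whole 83-word buffer as a single sequence. A small sketch of how such offsets relate to sequence lengths; `BuildLoD` is an illustrative helper, not part of Paddle:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Turn per-sequence lengths into cumulative LoD offsets: entry k is where
// sequence k starts in the flat buffer, and the last entry is the total size.
std::vector<size_t> BuildLoD(const std::vector<size_t>& seq_lens) {
  std::vector<size_t> offsets{0};
  for (size_t len : seq_lens) offsets.push_back(offsets.back() + len);
  return offsets;
}

int main() {
  // Two sequences of lengths 3 and 5 -> offsets {0, 3, 8}; the test's
  // single 83-word input corresponds to {0, 83}.
  for (size_t off : BuildLoD({3, 5})) std::cout << off << " ";
  std::cout << "\n";
  return 0;
}
```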