diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index c89dd41e0a6283e0723e2925f28c0372cda6a2b2..ab7f55337488f9e4c953210124e47c12e26ed6b1 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -26,14 +26,20 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
 
+extern std::string paddle::framework::DataTypeToString(
+    const framework::proto::VarType::Type type);
+
 namespace paddle {
 namespace inference {
 
+using paddle::framework::DataTypeToString;
+
 // Timer for timer
 class Timer {
  public:
@@ -267,17 +273,20 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
 }
 
 static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-                      double batch_latency, int epoch = 1) {
+                      double batch_latency, int epoch = 1,
+                      const framework::proto::VarType::Type data_type =
+                          framework::proto::VarType::FP32) {
   PADDLE_ENFORCE(batch_size > 0, "Non-positive batch size.");
   double sample_latency = batch_latency / batch_size;
   LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid
             << " ======";
-  LOG(INFO) << "====== batch_size: " << batch_size << ", iterations: " << epoch
+  LOG(INFO) << "====== batch size: " << batch_size << ", iterations: " << epoch
             << ", repetitions: " << repeat << " ======";
   LOG(INFO) << "====== batch latency: " << batch_latency
             << "ms, number of samples: " << batch_size * epoch
             << ", sample latency: " << sample_latency
-            << "ms, fps: " << 1000.f / sample_latency << " ======";
+            << "ms, fps: " << 1000.f / sample_latency
+            << ", data type: " << DataTypeToString(data_type) << " ======";
 }
 
 static bool IsFileExists(const std::string &path) {
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 10fc7556994b93776ed15184ba17820cebae07a0..a50810948ff8cb9e0bb92c287a7ab3945d39e089 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -65,6 +65,8 @@ DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 namespace inference {
 
+using paddle::framework::proto::VarType;
+
 template <typename T>
 constexpr paddle::PaddleDType GetPaddleDType();
@@ -293,7 +295,8 @@ void ConvertPaddleTensorToZeroCopyTensor(
 void PredictionWarmUp(PaddlePredictor *predictor,
                       const std::vector<std::vector<PaddleTensor>> &inputs,
                       std::vector<std::vector<PaddleTensor>> *outputs,
-                      int num_threads, int tid) {
+                      int num_threads, int tid,
+                      const VarType::Type data_type = VarType::FP32) {
   int batch_size = FLAGS_batch_size;
   LOG(INFO) << "Running thread " << tid << ", warm up run...";
   if (FLAGS_zero_copy) {
@@ -307,7 +310,7 @@
   } else {
     predictor->ZeroCopyRun();
   }
-  PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
+  PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1, data_type);
   if (FLAGS_profile) {
     paddle::platform::ResetProfiler();
   }
@@ -316,7 +319,8 @@
 void PredictionRun(PaddlePredictor *predictor,
                    const std::vector<std::vector<PaddleTensor>> &inputs,
                    std::vector<std::vector<PaddleTensor>> *outputs,
-                   int num_threads, int tid) {
+                   int num_threads, int tid,
+                   const VarType::Type data_type = VarType::FP32) {
   int num_times = FLAGS_repeat;
   int iterations = inputs.size();  // process the whole dataset ...
   if (FLAGS_iterations > 0 &&
@@ -355,7 +359,7 @@ void PredictionRun(PaddlePredictor *predictor,
 
   auto batch_latency = elapsed_time / (iterations * num_times);
   PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency,
-            iterations);
+            iterations, data_type);
   if (FLAGS_record_benchmark) {
     Benchmark benchmark;
     benchmark.SetName(FLAGS_model_name);
@@ -368,12 +372,13 @@
 void TestOneThreadPrediction(
     const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
-    std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
+    std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true,
+    const VarType::Type data_type = VarType::FP32) {
   auto predictor = CreateTestPredictor(config, use_analysis);
   if (FLAGS_warmup) {
-    PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
+    PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0, data_type);
   }
-  PredictionRun(predictor.get(), inputs, outputs, 1, 0);
+  PredictionRun(predictor.get(), inputs, outputs, 1, 0, data_type);
 }
 
 void TestMultiThreadPrediction(
@@ -505,13 +510,14 @@ void CompareQuantizedAndAnalysis(
   auto *cfg = reinterpret_cast<const PaddlePredictor::Config *>(config);
   PrintConfig(cfg, true);
   std::vector<std::vector<PaddleTensor>> analysis_outputs;
-  TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true);
+  TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32);
 
   LOG(INFO) << "--- INT8 prediction start ---";
   auto *qcfg = reinterpret_cast<const PaddlePredictor::Config *>(qconfig);
   PrintConfig(qcfg, true);
   std::vector<std::vector<PaddleTensor>> quantized_outputs;
-  TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true);
+  TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true,
+                          VarType::INT8);
 
   LOG(INFO) << "--- comparing outputs --- ";
   CompareTopAccuracy(quantized_outputs, analysis_outputs);
@@ -640,7 +646,7 @@ static bool CompareTensorData(const framework::LoDTensor &a,
   }
 
   for (size_t i = 0; i < a_size; i++) {
-    if (a.type() == framework::proto::VarType::FP32) {
+    if (a.type() == VarType::FP32) {
       const auto *a_data = a.data<float>();
       const auto *b_data = b.data<float>();
       if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
@@ -649,7 +655,7 @@
             b_data[i]);
         return false;
       }
-    } else if (a.type() == framework::proto::VarType::INT64) {
+    } else if (a.type() == VarType::INT64) {
       const auto *a_data = a.data<int64_t>();
       const auto *b_data = b.data<int64_t>();
       if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
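
Note on the pattern introduced above: an optional data_type argument with an FP32 default is threaded through PrintTime, PredictionWarmUp, PredictionRun and TestOneThreadPrediction, so every pre-existing call site keeps compiling unchanged and only the INT8 path in CompareQuantizedAndAnalysis passes the type explicitly. The standalone C++ sketch below illustrates that defaulted-trailing-parameter approach; the Type enum and the DataTypeToString mapping are simplified stand-ins for Paddle's framework::proto::VarType and paddle::framework::DataTypeToString, not the real definitions.

// Minimal sketch of the defaulted data-type parameter used by the patch.
// Everything here is a stand-in; only the calling pattern mirrors the diff.
#include <iostream>
#include <string>

namespace sketch {

// Stand-in for framework::proto::VarType::Type.
enum class Type { FP32, INT8, INT64 };

// Stand-in for paddle::framework::DataTypeToString.
std::string DataTypeToString(Type type) {
  switch (type) {
    case Type::FP32:
      return "float";
    case Type::INT8:
      return "int8";
    case Type::INT64:
      return "int64";
  }
  return "unknown";
}

// Mirrors the new PrintTime signature: the trailing defaulted argument
// keeps callers written before the change source-compatible.
void PrintTime(int batch_size, double batch_latency,
               Type data_type = Type::FP32) {
  double sample_latency = batch_latency / batch_size;
  std::cout << "====== batch size: " << batch_size
            << ", sample latency: " << sample_latency
            << "ms, data type: " << DataTypeToString(data_type)
            << " ======\n";
}

}  // namespace sketch

int main() {
  sketch::PrintTime(8, 12.0);                     // old-style call: labeled FP32
  sketch::PrintTime(8, 4.5, sketch::Type::INT8);  // quantized run: labeled INT8
  return 0;
}

Using a defaulted trailing parameter rather than a second PrintTime overload keeps the benchmark log format in one place; callers opt into the data-type label only where it matters, i.e. when timing the quantized INT8 predictor against the FP32 baseline.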