unify FP32 vs. INT8 comparison tests output (#18111)

test=develop

unify FP32 vs. INT8 comparison tests output (#18111)
test=develop
ca5642c8 · Wojciech Uss · Tao Luo · c26130f3 · ca5642c8 · ca5642c8
3 changed files
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -320,7 +320,8 @@ void PredictionRun(PaddlePredictor *predictor,
                   const std::vector<std::vector<PaddleTensor>> &inputs,
                   std::vector<std::vector<PaddleTensor>> *outputs,
                   int num_threads, int tid,
-                   const VarType::Type data_type = VarType::FP32) {
+                   const VarType::Type data_type = VarType::FP32,
+                   float *sample_latency = nullptr) {
  int num_times = FLAGS_repeat;
  int iterations = inputs.size();  // process the whole dataset ...
  if (FLAGS_iterations > 0 &&
@@ -360,6 +361,10 @@ void PredictionRun(PaddlePredictor *predictor,
  auto batch_latency = elapsed_time / (iterations * num_times);
  PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency,
            iterations, data_type);
+
+  if (sample_latency != nullptr)
+    *sample_latency = batch_latency / FLAGS_batch_size;
+
  if (FLAGS_record_benchmark) {
    Benchmark benchmark;
    benchmark.SetName(FLAGS_model_name);
@@ -373,12 +378,14 @@ void TestOneThreadPrediction(
    const PaddlePredictor::Config *config,
    const std::vector<std::vector<PaddleTensor>> &inputs,
    std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true,
-    const VarType::Type data_type = VarType::FP32) {
+    const VarType::Type data_type = VarType::FP32,
+    float *sample_latency = nullptr) {
  auto predictor = CreateTestPredictor(config, use_analysis);
  if (FLAGS_warmup) {
    PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0, data_type);
  }
-  PredictionRun(predictor.get(), inputs, outputs, 1, 0, data_type);
+  PredictionRun(predictor.get(), inputs, outputs, 1, 0, data_type,
+                sample_latency);
 }

 void TestMultiThreadPrediction(
@@ -430,6 +437,31 @@ void TestPrediction(const PaddlePredictor::Config *config,
  }
 }

+void SummarizeAccuracy(float avg_acc1_fp32, float avg_acc1_int8) {
+  LOG(INFO) << "--- Accuracy summary --- ";
+  LOG(INFO) << "Accepted top1 accuracy drop threshold: "
+            << FLAGS_quantized_accuracy
+            << ". (condition: (FP32_top1_acc - INT8_top1_acc) <= threshold)";
+  LOG(INFO) << "FP32: avg top1 accuracy: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << avg_acc1_fp32;
+  LOG(INFO) << "INT8: avg top1 accuracy: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << avg_acc1_int8;
+}
+
+void SummarizePerformance(float sample_latency_fp32,
+                          float sample_latency_int8) {
+  // sample latency in ms
+  auto throughput_fp32 = 1000.0 / sample_latency_fp32;
+  auto throughput_int8 = 1000.0 / sample_latency_int8;
+  LOG(INFO) << "--- Performance summary --- ";
+  LOG(INFO) << "FP32: avg fps: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << throughput_fp32
+            << ", avg latency: " << sample_latency_fp32 << " ms";
+  LOG(INFO) << "INT8: avg fps: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << throughput_int8
+            << ", avg latency: " << sample_latency_int8 << " ms";
+}
+
 void CompareTopAccuracy(
    const std::vector<std::vector<PaddleTensor>> &output_slots_quant,
    const std::vector<std::vector<PaddleTensor>> &output_slots_ref) {
@@ -459,12 +491,10 @@ void CompareTopAccuracy(
  float avg_acc1_quant = total_accs1_quant / output_slots_quant.size();
  float avg_acc1_ref = total_accs1_ref / output_slots_ref.size();

-  LOG(INFO) << "Avg top1 INT8 accuracy: " << std::fixed << std::setw(6)
-            << std::setprecision(4) << avg_acc1_quant;
-  LOG(INFO) << "Avg top1 FP32 accuracy: " << std::fixed << std::setw(6)
-            << std::setprecision(4) << avg_acc1_ref;
-  LOG(INFO) << "Accepted accuracy drop threshold: " << FLAGS_quantized_accuracy;
-  CHECK_LE(std::abs(avg_acc1_quant - avg_acc1_ref), FLAGS_quantized_accuracy);
+  SummarizeAccuracy(avg_acc1_ref, avg_acc1_quant);
+  CHECK_GT(avg_acc1_ref, 0.0);
+  CHECK_GT(avg_acc1_quant, 0.0);
+  CHECK_LE(avg_acc1_ref - avg_acc1_quant, FLAGS_quantized_accuracy);
 }

 void CompareDeterministic(
@@ -510,16 +540,19 @@ void CompareQuantizedAndAnalysis(
  auto *cfg = reinterpret_cast<const PaddlePredictor::Config *>(config);
  PrintConfig(cfg, true);
  std::vector<std::vector<PaddleTensor>> analysis_outputs;
-  TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32);
+  float sample_latency_fp32{-1};
+  TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32,
+                          &sample_latency_fp32);

  LOG(INFO) << "--- INT8 prediction start ---";
  auto *qcfg = reinterpret_cast<const PaddlePredictor::Config *>(qconfig);
  PrintConfig(qcfg, true);
  std::vector<std::vector<PaddleTensor>> quantized_outputs;
-  TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true,
-                          VarType::INT8);
+  float sample_latency_int8{-1};
+  TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true, VarType::INT8,
+                          &sample_latency_int8);

-  LOG(INFO) << "--- comparing outputs --- ";
+  SummarizePerformance(sample_latency_fp32, sample_latency_int8);
  CompareTopAccuracy(quantized_outputs, analysis_outputs);
 }


--- a/python/paddle/fluid/contrib/slim/tests/qat_int8_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/qat_int8_comparison.py
@@ -83,8 +83,8 @@ class TestQatInt8Comparison(unittest.TestCase):
                while step < num:
                    fp.seek(imgs_offset + img_size * step)
                    img = fp.read(img_size)
-                    img = struct.unpack_from('{}f'.format(img_ch * img_w *
-                                                          img_h), img)
+                    img = struct.unpack_from(
+                        '{}f'.format(img_ch * img_w * img_h), img)
                    img = np.array(img)
                    img.shape = (img_ch, img_w, img_h)
                    fp.seek(labels_offset + label_size * step)
@@ -147,6 +147,7 @@ class TestQatInt8Comparison(unittest.TestCase):
    def _predict(self,
                 test_reader=None,
                 model_path=None,
+                 batch_size=1,
                 batch_num=1,
                 skip_batch_num=0,
                 transform_to_int8=False):
@@ -199,7 +200,7 @@ class TestQatInt8Comparison(unittest.TestCase):
                out = exe.run(inference_program,
                              feed={feed_target_names[0]: images},
                              fetch_list=fetch_targets)
-                batch_time = time.time() - start
+                batch_time = (time.time() - start) * 1000  # in milliseconds
                outputs.append(out[0])
                batch_acc1, batch_acc5 = self._get_batch_accuracy(out[0],
                                                                  labels)
@@ -212,14 +213,15 @@ class TestQatInt8Comparison(unittest.TestCase):
                fpses.append(fps)
                iters += 1
                appx = ' (warm-up)' if iters <= skip_batch_num else ''
-                _logger.info(
-                    'batch {0}{5}, acc1: {1:.4f}, acc5: {2:.4f}, '
-                    'batch latency: {3:.4f} s, batch fps: {4:.2f}'.format(
-                        iters, batch_acc1, batch_acc5, batch_time, fps, appx))
+                _logger.info('batch {0}{5}, acc1: {1:.4f}, acc5: {2:.4f}, '
+                             'latency: {3:.4f} ms, fps: {4:.2f}'.format(
+                                 iters, batch_acc1, batch_acc5, batch_time /
+                                 batch_size, fps, appx))

            # Postprocess benchmark data
-            latencies = batch_times[skip_batch_num:]
-            latency_avg = np.average(latencies)
+            batch_latencies = batch_times[skip_batch_num:]
+            batch_latency_avg = np.average(batch_latencies)
+            latency_avg = batch_latency_avg / batch_size
            fpses = fpses[skip_batch_num:]
            fps_avg = np.average(fpses)
            infer_total_time = time.time() - infer_start_time
@@ -230,13 +232,25 @@ class TestQatInt8Comparison(unittest.TestCase):

            return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg

+    def _summarize_performance(self, fp32_fps, fp32_lat, int8_fps, int8_lat):
+        _logger.info('--- Performance summary ---')
+        _logger.info('FP32: avg fps: {0:.2f}, avg latency: {1:.4f} ms'.format(
+            fp32_fps, fp32_lat))
+        _logger.info('INT8: avg fps: {0:.2f}, avg latency: {1:.4f} ms'.format(
+            int8_fps, int8_lat))
+
    def _compare_accuracy(self, fp32_acc1, fp32_acc5, int8_acc1, int8_acc5,
                          threshold):
-        _logger.info('Accepted acc1 diff threshold: {0}'.format(threshold))
-        _logger.info('FP32: avg acc1: {0:.4f}, avg acc5: {1:.4f}'.format(
-            fp32_acc1, fp32_acc5))
-        _logger.info('INT8: avg acc1: {0:.4f}, avg acc5: {1:.4f}'.format(
-            int8_acc1, int8_acc5))
+        _logger.info('--- Accuracy summary ---')
+        _logger.info(
+            'Accepted top1 accuracy drop threshold: {0}. (condition: (FP32_top1_acc - INT8_top1_acc) <= threshold)'
+            .format(threshold))
+        _logger.info(
+            'FP32: avg top1 accuracy: {0:.4f}, avg top5 accuracy: {1:.4f}'.
+            format(fp32_acc1, fp32_acc5))
+        _logger.info(
+            'INT8: avg top1 accuracy: {0:.4f}, avg top5 accuracy: {1:.4f}'.
+            format(int8_acc1, int8_acc5))
        assert fp32_acc1 > 0.0
        assert int8_acc1 > 0.0
        assert fp32_acc1 - int8_acc1 <= threshold
@@ -257,9 +271,7 @@ class TestQatInt8Comparison(unittest.TestCase):
        _logger.info('Dataset: {0}'.format(data_path))
        _logger.info('Batch size: {0}'.format(batch_size))
        _logger.info('Batch number: {0}'.format(batch_num))
-        _logger.info('Accuracy diff threshold: {0}. '
-                     '(condition: (fp32_acc - int8_acc) <= threshold)'
-                     .format(acc_diff_threshold))
+        _logger.info('Accuracy drop threshold: {0}.'.format(acc_diff_threshold))

        _logger.info('--- QAT FP32 prediction start ---')
        val_reader = paddle.batch(
@@ -267,6 +279,7 @@ class TestQatInt8Comparison(unittest.TestCase):
        fp32_output, fp32_acc1, fp32_acc5, fp32_fps, fp32_lat = self._predict(
            val_reader,
            qat_model_path,
+            batch_size,
            batch_num,
            skip_batch_num,
            transform_to_int8=False)
@@ -277,17 +290,12 @@ class TestQatInt8Comparison(unittest.TestCase):
        int8_output, int8_acc1, int8_acc5, int8_fps, int8_lat = self._predict(
            val_reader,
            qat_model_path,
+            batch_size,
            batch_num,
            skip_batch_num,
            transform_to_int8=True)

-        _logger.info('--- Performance summary ---')
-        _logger.info('FP32: avg fps: {0:.2f}, avg latency: {1:.4f} s'.format(
-            fp32_fps, fp32_lat))
-        _logger.info('INT8: avg fps: {0:.2f}, avg latency: {1:.4f} s'.format(
-            int8_fps, int8_lat))
-
-        _logger.info('--- Comparing accuracy ---')
+        self._summarize_performance(fp32_fps, fp32_lat, int8_fps, int8_lat)
        self._compare_accuracy(fp32_acc1, fp32_acc5, int8_acc1, int8_acc5,
                               acc_diff_threshold)


--- a/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
@@ -172,6 +172,17 @@ class TestMKLDNNPostTrainingQuantStrategy(unittest.TestCase):
        com_pass.config(config_path)
        com_pass.run()

+    def _compare_accuracy(self, fp32_acc1, int8_acc1, threshold):
+        _logger.info('--- Accuracy summary ---')
+        _logger.info(
+            'Accepted top1 accuracy drop threshold: {0}. (condition: (FP32_top1_acc - INT8_top1_acc) <= threshold)'
+            .format(threshold))
+        _logger.info('FP32: avg top1 accuracy: {0:.4f}'.format(fp32_acc1))
+        _logger.info('INT8: avg top1 accuracy: {0:.4f}'.format(int8_acc1))
+        assert fp32_acc1 > 0.0
+        assert int8_acc1 > 0.0
+        assert fp32_acc1 - int8_acc1 <= threshold
+
    def test_compression(self):
        if not fluid.core.is_compiled_with_mkldnn():
            return
@@ -204,15 +215,8 @@ class TestMKLDNNPostTrainingQuantStrategy(unittest.TestCase):
            self._reader_creator(data_path, False), batch_size=batch_size)
        fp32_model_result = self._predict(val_reader, fp32_model_path)

-        _logger.info('--- comparing outputs ---')
-        _logger.info('Avg top1 INT8 accuracy: {0:.4f}'.format(int8_model_result[
-            0]))
-        _logger.info('Avg top1 FP32 accuracy: {0:.4f}'.format(fp32_model_result[
-            0]))
-        _logger.info('Accepted accuracy drop threshold: {0}'.format(
-            accuracy_diff_threshold))
-        assert fp32_model_result[0] - int8_model_result[
-            0] <= accuracy_diff_threshold
+        self._compare_accuracy(fp32_model_result[0], int8_model_result[0],
+                               accuracy_diff_threshold)


 if __name__ == '__main__':