From ca5642c8502a6a11ec6eb75e1dca3b5b5ef5b15e Mon Sep 17 00:00:00 2001
From: Wojciech Uss
Date: Sun, 16 Jun 2019 16:36:50 +0200
Subject: [PATCH] unify FP32 vs. INT8 comparison tests output (#18111)

test=develop
---
 .../fluid/inference/tests/api/tester_helper.h | 59 +++++++++++++++----
 .../contrib/slim/tests/qat_int8_comparison.py | 56 ++++++++++--------
 .../test_mkldnn_int8_quantization_strategy.py | 22 ++++---
 3 files changed, 91 insertions(+), 46 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index eda86c3b42b..eb786196a88 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -320,7 +320,8 @@ void PredictionRun(PaddlePredictor *predictor,
                    const std::vector<std::vector<PaddleTensor>> &inputs,
                    std::vector<std::vector<PaddleTensor>> *outputs,
                    int num_threads, int tid,
-                   const VarType::Type data_type = VarType::FP32) {
+                   const VarType::Type data_type = VarType::FP32,
+                   float *sample_latency = nullptr) {
   int num_times = FLAGS_repeat;
   int iterations = inputs.size();  // process the whole dataset ...
   if (FLAGS_iterations > 0 &&
@@ -360,6 +361,10 @@
   auto batch_latency = elapsed_time / (iterations * num_times);
   PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency,
             iterations, data_type);
+
+  if (sample_latency != nullptr)
+    *sample_latency = batch_latency / FLAGS_batch_size;
+
   if (FLAGS_record_benchmark) {
     Benchmark benchmark;
     benchmark.SetName(FLAGS_model_name);
@@ -373,12 +378,14 @@ void TestOneThreadPrediction(
     const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true,
-    const VarType::Type data_type = VarType::FP32) {
+    const VarType::Type data_type = VarType::FP32,
+    float *sample_latency = nullptr) {
   auto predictor = CreateTestPredictor(config, use_analysis);
   if (FLAGS_warmup) {
     PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0, data_type);
   }
-  PredictionRun(predictor.get(), inputs, outputs, 1, 0, data_type);
+  PredictionRun(predictor.get(), inputs, outputs, 1, 0, data_type,
+                sample_latency);
 }
 
 void TestMultiThreadPrediction(
@@ -430,6 +437,31 @@ void TestPrediction(const PaddlePredictor::Config *config,
   }
 }
 
+void SummarizeAccuracy(float avg_acc1_fp32, float avg_acc1_int8) {
+  LOG(INFO) << "--- Accuracy summary --- ";
+  LOG(INFO) << "Accepted top1 accuracy drop threshold: "
+            << FLAGS_quantized_accuracy
+            << ". (condition: (FP32_top1_acc - INT8_top1_acc) <= threshold)";
+  LOG(INFO) << "FP32: avg top1 accuracy: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << avg_acc1_fp32;
+  LOG(INFO) << "INT8: avg top1 accuracy: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << avg_acc1_int8;
+}
+
+void SummarizePerformance(float sample_latency_fp32,
+                          float sample_latency_int8) {
+  // sample latency in ms
+  auto throughput_fp32 = 1000.0 / sample_latency_fp32;
+  auto throughput_int8 = 1000.0 / sample_latency_int8;
+  LOG(INFO) << "--- Performance summary --- ";
+  LOG(INFO) << "FP32: avg fps: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << throughput_fp32
+            << ", avg latency: " << sample_latency_fp32 << " ms";
+  LOG(INFO) << "INT8: avg fps: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << throughput_int8
+            << ", avg latency: " << sample_latency_int8 << " ms";
+}
+
 void CompareTopAccuracy(
     const std::vector<std::vector<PaddleTensor>> &output_slots_quant,
     const std::vector<std::vector<PaddleTensor>> &output_slots_ref) {
@@ -459,12 +491,10 @@
   float avg_acc1_quant = total_accs1_quant / output_slots_quant.size();
   float avg_acc1_ref = total_accs1_ref / output_slots_ref.size();
 
-  LOG(INFO) << "Avg top1 INT8 accuracy: " << std::fixed << std::setw(6)
-            << std::setprecision(4) << avg_acc1_quant;
-  LOG(INFO) << "Avg top1 FP32 accuracy: " << std::fixed << std::setw(6)
-            << std::setprecision(4) << avg_acc1_ref;
-  LOG(INFO) << "Accepted accuracy drop threshold: " << FLAGS_quantized_accuracy;
-  CHECK_LE(std::abs(avg_acc1_quant - avg_acc1_ref), FLAGS_quantized_accuracy);
+  SummarizeAccuracy(avg_acc1_ref, avg_acc1_quant);
+  CHECK_GT(avg_acc1_ref, 0.0);
+  CHECK_GT(avg_acc1_quant, 0.0);
+  CHECK_LE(avg_acc1_ref - avg_acc1_quant, FLAGS_quantized_accuracy);
 }
 
 void CompareDeterministic(
@@ -510,16 +540,19 @@ void CompareQuantizedAndAnalysis(
   auto *cfg = reinterpret_cast<const PaddlePredictor::Config *>(config);
   PrintConfig(cfg, true);
   std::vector<std::vector<PaddleTensor>> analysis_outputs;
-  TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32);
+  float sample_latency_fp32{-1};
+  TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32,
+                          &sample_latency_fp32);
 
   LOG(INFO) << "--- INT8 prediction start ---";
   auto *qcfg = reinterpret_cast<const PaddlePredictor::Config *>(qconfig);
   PrintConfig(qcfg, true);
   std::vector<std::vector<PaddleTensor>> quantized_outputs;
-  TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true,
-                          VarType::INT8);
+  float sample_latency_int8{-1};
+  TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true, VarType::INT8,
+                          &sample_latency_int8);
 
-  LOG(INFO) << "--- comparing outputs --- ";
+  SummarizePerformance(sample_latency_fp32, sample_latency_int8);
   CompareTopAccuracy(quantized_outputs, analysis_outputs);
 }
 
diff --git a/python/paddle/fluid/contrib/slim/tests/qat_int8_comparison.py b/python/paddle/fluid/contrib/slim/tests/qat_int8_comparison.py
index f8cd5a663ec..6673811a791 100644
--- a/python/paddle/fluid/contrib/slim/tests/qat_int8_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/qat_int8_comparison.py
@@ -83,8 +83,8 @@ class TestQatInt8Comparison(unittest.TestCase):
         while step < num:
             fp.seek(imgs_offset + img_size * step)
             img = fp.read(img_size)
-            img = struct.unpack_from('{}f'.format(img_ch * img_w *
-                                                  img_h), img)
+            img = struct.unpack_from(
+                '{}f'.format(img_ch * img_w * img_h), img)
             img = np.array(img)
             img.shape = (img_ch, img_w, img_h)
             fp.seek(labels_offset + label_size * step)
@@ -147,6 +147,7 @@
     def _predict(self,
                  test_reader=None,
                  model_path=None,
+                 batch_size=1,
                  batch_num=1,
                  skip_batch_num=0,
                  transform_to_int8=False):
@@ -199,7 +200,7 @@
                 out = exe.run(inference_program,
                               feed={feed_target_names[0]: images},
                               fetch_list=fetch_targets)
-                batch_time = time.time() - start
+                batch_time = (time.time() - start) * 1000  # in milliseconds
                 outputs.append(out[0])
                 batch_acc1, batch_acc5 = self._get_batch_accuracy(out[0],
                                                                   labels)
@@ -212,14 +213,15 @@
                 fpses.append(fps)
                 iters += 1
                 appx = ' (warm-up)' if iters <= skip_batch_num else ''
-                _logger.info(
-                    'batch {0}{5}, acc1: {1:.4f}, acc5: {2:.4f}, '
-                    'batch latency: {3:.4f} s, batch fps: {4:.2f}'.format(
-                        iters, batch_acc1, batch_acc5, batch_time, fps, appx))
+                _logger.info('batch {0}{5}, acc1: {1:.4f}, acc5: {2:.4f}, '
+                             'latency: {3:.4f} ms, fps: {4:.2f}'.format(
+                                 iters, batch_acc1, batch_acc5, batch_time /
+                                 batch_size, fps, appx))
 
             # Postprocess benchmark data
-            latencies = batch_times[skip_batch_num:]
-            latency_avg = np.average(latencies)
+            batch_latencies = batch_times[skip_batch_num:]
+            batch_latency_avg = np.average(batch_latencies)
+            latency_avg = batch_latency_avg / batch_size
             fpses = fpses[skip_batch_num:]
             fps_avg = np.average(fpses)
             infer_total_time = time.time() - infer_start_time
@@ -230,13 +232,25 @@
 
             return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
 
+    def _summarize_performance(self, fp32_fps, fp32_lat, int8_fps, int8_lat):
+        _logger.info('--- Performance summary ---')
+        _logger.info('FP32: avg fps: {0:.2f}, avg latency: {1:.4f} ms'.format(
+            fp32_fps, fp32_lat))
+        _logger.info('INT8: avg fps: {0:.2f}, avg latency: {1:.4f} ms'.format(
+            int8_fps, int8_lat))
+
     def _compare_accuracy(self, fp32_acc1, fp32_acc5, int8_acc1, int8_acc5,
                           threshold):
-        _logger.info('Accepted acc1 diff threshold: {0}'.format(threshold))
-        _logger.info('FP32: avg acc1: {0:.4f}, avg acc5: {1:.4f}'.format(
-            fp32_acc1, fp32_acc5))
-        _logger.info('INT8: avg acc1: {0:.4f}, avg acc5: {1:.4f}'.format(
-            int8_acc1, int8_acc5))
+        _logger.info('--- Accuracy summary ---')
+        _logger.info(
+            'Accepted top1 accuracy drop threshold: {0}. (condition: (FP32_top1_acc - INT8_top1_acc) <= threshold)'
+            .format(threshold))
+        _logger.info(
+            'FP32: avg top1 accuracy: {0:.4f}, avg top5 accuracy: {1:.4f}'.
+            format(fp32_acc1, fp32_acc5))
+        _logger.info(
+            'INT8: avg top1 accuracy: {0:.4f}, avg top5 accuracy: {1:.4f}'.
+            format(int8_acc1, int8_acc5))
         assert fp32_acc1 > 0.0
         assert int8_acc1 > 0.0
         assert fp32_acc1 - int8_acc1 <= threshold
@@ -257,9 +271,7 @@
         _logger.info('Dataset: {0}'.format(data_path))
         _logger.info('Batch size: {0}'.format(batch_size))
         _logger.info('Batch number: {0}'.format(batch_num))
-        _logger.info('Accuracy diff threshold: {0}. '
-                     '(condition: (fp32_acc - int8_acc) <= threshold)'
-                     .format(acc_diff_threshold))
+        _logger.info('Accuracy drop threshold: {0}.'.format(acc_diff_threshold))
 
         _logger.info('--- QAT FP32 prediction start ---')
         val_reader = paddle.batch(
@@ -267,6 +279,7 @@
         fp32_output, fp32_acc1, fp32_acc5, fp32_fps, fp32_lat = self._predict(
             val_reader,
             qat_model_path,
+            batch_size,
             batch_num,
             skip_batch_num,
             transform_to_int8=False)
@@ -277,17 +290,12 @@
         int8_output, int8_acc1, int8_acc5, int8_fps, int8_lat = self._predict(
             val_reader,
             qat_model_path,
+            batch_size,
             batch_num,
             skip_batch_num,
             transform_to_int8=True)
 
-        _logger.info('--- Performance summary ---')
-        _logger.info('FP32: avg fps: {0:.2f}, avg latency: {1:.4f} s'.format(
-            fp32_fps, fp32_lat))
-        _logger.info('INT8: avg fps: {0:.2f}, avg latency: {1:.4f} s'.format(
-            int8_fps, int8_lat))
-
-        _logger.info('--- Comparing accuracy ---')
+        self._summarize_performance(fp32_fps, fp32_lat, int8_fps, int8_lat)
         self._compare_accuracy(fp32_acc1, fp32_acc5, int8_acc1, int8_acc5,
                                acc_diff_threshold)
 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py b/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
index 0db77ca4fb4..1c41a316a62 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
@@ -172,6 +172,17 @@ class TestMKLDNNPostTrainingQuantStrategy(unittest.TestCase):
         com_pass.config(config_path)
         com_pass.run()
 
+    def _compare_accuracy(self, fp32_acc1, int8_acc1, threshold):
+        _logger.info('--- Accuracy summary ---')
+        _logger.info(
+            'Accepted top1 accuracy drop threshold: {0}. (condition: (FP32_top1_acc - INT8_top1_acc) <= threshold)'
+            .format(threshold))
+        _logger.info('FP32: avg top1 accuracy: {0:.4f}'.format(fp32_acc1))
+        _logger.info('INT8: avg top1 accuracy: {0:.4f}'.format(int8_acc1))
+        assert fp32_acc1 > 0.0
+        assert int8_acc1 > 0.0
+        assert fp32_acc1 - int8_acc1 <= threshold
+
     def test_compression(self):
         if not fluid.core.is_compiled_with_mkldnn():
             return
@@ -204,15 +215,8 @@
             self._reader_creator(data_path, False), batch_size=batch_size)
         fp32_model_result = self._predict(val_reader, fp32_model_path)
 
-        _logger.info('--- comparing outputs ---')
-        _logger.info('Avg top1 INT8 accuracy: {0:.4f}'.format(int8_model_result[
-            0]))
-        _logger.info('Avg top1 FP32 accuracy: {0:.4f}'.format(fp32_model_result[
-            0]))
-        _logger.info('Accepted accuracy drop threshold: {0}'.format(
-            accuracy_diff_threshold))
-        assert fp32_model_result[0] - int8_model_result[
-            0] <= accuracy_diff_threshold
+        self._compare_accuracy(fp32_model_result[0], int8_model_result[0],
+                               accuracy_diff_threshold)
 
 
 if __name__ == '__main__':
-- 
GitLab