diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
index 112623d23a65f2cd6e2747e24f3fb72c9d9b5cf3..7930923668c7d77ebf90e229e4c1032575fac710 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
@@ -93,7 +93,8 @@ class Quant2Int8MkldnnPass(object):
         graph = self._dequantize_weights(graph)
         graph = self._optimize_fp32_graph(graph)
         graph = self._compute_weight_scales(graph)
-        graph = self._update_relu_output_scales(graph)
+        # This function causes nondeterministic quantization behavior
+        # graph = self._update_relu_output_scales(graph)
         graph = self._propagate_scales(graph)
         graph = self._quantize_fp32_graph(graph)
         graph = self._final_optimizations(graph)
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index e55db665052cec0176e4e070f2bcb06190fabde7..03503111fca9a6e259aefe8657ac07a69e6bcaf1 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -92,17 +92,14 @@ function(inference_quant2_int8_nlp_test target quant_model_dir fp32_model_dir da
                --ops_to_quantize ${ops_to_quantize})
 endfunction()
 
-function(inference_quant2_int8_lstm_model_test target fp32_model dataset_path)
+function(inference_quant2_int8_lstm_model_test target fp32_model quant_model dataset_path)
     py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_lstm_model.py"
-            ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 FLAGS_use_mkldnn=true
            ARGS --fp32_model ${fp32_model}
+                --quant_model ${quant_model}
                 --infer_data ${dataset_path}
-                --num_threads 4
+                --num_threads 1
                 --mkldnn_cache_capacity 100
                 --warmup_iter 100
-                --warmup_batch_size 1
                 --acc_diff_threshold 0.11)
 endfunction()
 
@@ -293,11 +290,10 @@ if(LINUX AND WITH_MKLDNN)
 
     # PTQ int8 lstm model
    set(LSTM_DATA_ARCHIVE "unittest_model_data/quant_lstm_input_data.tar.gz")
-    set(QUANT2_INT8_LSTM_SAVE_PATH "${QUANT_INSTALL_DIR}/lstm_quant2")
    download_quant_data(${QUANT2_INT8_LSTM_SAVE_PATH} ${LSTM_DATA_ARCHIVE} add84c754e9b792fea1fbd728d134ab7)
    set(QUANT2_FP32_LSTM_MODEL_ARCHIVE "lstm_fp32_model.tar.gz")
    download_lstm_model(${QUANT2_INT8_LSTM_SAVE_PATH} ${QUANT2_FP32_LSTM_MODEL_ARCHIVE} eecd9f44d69a84acc1cf2235c4b8b743)
-    inference_quant2_int8_lstm_model_test(test_quant2_int8_lstm_mkldnn ${QUANT2_INT8_LSTM_SAVE_PATH}/lstm_fp32_model ${QUANT2_INT8_LSTM_SAVE_PATH}/quant_lstm_input_data)
+    inference_quant2_int8_lstm_model_test(test_quant2_int8_lstm_mkldnn ${QUANT2_INT8_LSTM_SAVE_PATH}/lstm_fp32_model ${QUANT2_LSTM_MODEL_DIR}/lstm_quant ${QUANT2_INT8_LSTM_SAVE_PATH}/quant_lstm_input_data)
 endif()
diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py
index 0e33bd8ba1a4e085fc46ff132a20c1a4a06360bf..4f4a2ddd4ab417096776f5c1da70b3e1860160ee 100644
--- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py
@@ -20,30 +20,28 @@ import time
 import unittest
 from paddle import fluid
 from paddle.fluid.core import AnalysisConfig, create_paddle_predictor
+from save_quant_model import transform_and_save_int8_model
 
 
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         '--fp32_model', type=str, default='', help='A path to a FP32 model.')
-    parser.add_argument('--infer_data', type=str, default='', help='Data file.')
     parser.add_argument(
-        '--num_threads', type=int, default=1, help='Number of threads.')
+        '--quant_model', type=str, default='', help='A path to a quant model.')
+    parser.add_argument('--infer_data', type=str, default='', help='Data file.')
     parser.add_argument(
         '--warmup_iter',
         type=int,
         default=1,
         help='Number of the first iterations to skip in performance statistics.')
-    parser.add_argument(
-        '--warmup_batch_size',
-        type=int,
-        default=1,
-        help='Number of batches to use in PTQ warmup. Default: 1.')
     parser.add_argument(
         '--acc_diff_threshold',
         type=float,
         default=0.01,
         help='Accepted accuracy difference threshold.')
+    parser.add_argument(
+        '--num_threads', type=int, default=1, help='Number of threads.')
     parser.add_argument(
         '--mkldnn_cache_capacity',
         type=int,
@@ -56,7 +54,7 @@ def parse_args():
 
 
 class TestLstmModelPTQ(unittest.TestCase):
-    def get_warmup_tensor(self, data_path, place, warmup_batch_size):
+    def get_warmup_tensor(self, data_path, place):
         data = []
         with open(data_path, 'rb') as in_f:
             while True:
@@ -87,30 +85,31 @@ class TestLstmModelPTQ(unittest.TestCase):
             infer_label.shape = label.shape
             infer_label.dtype = fluid.core.PaddleDType.INT32
             data.append([infer_data, infer_label])
-        warmup_data = data[:warmup_batch_size]
-        inputs = data[warmup_batch_size:]
+        warmup_data = data[:1]
+        inputs = data[1:]
         return warmup_data, inputs
 
     def set_config(self,
                    model_path,
                    num_threads,
                    mkldnn_cache_capacity,
-                   warmup_batch_size,
                    warmup_data=None,
-                   enable_int8=False):
+                   use_analysis=False,
+                   enable_ptq=False):
         config = AnalysisConfig(model_path)
-        config.disable_gpu()
-        config.switch_use_feed_fetch_ops(True)
-        config.switch_ir_optim(True)
         config.set_cpu_math_library_num_threads(num_threads)
-        # This pass to work properly, must be added before fc_fuse_pass
-        config.pass_builder().insert_pass(5, "fc_lstm_fuse_pass")
-        config.enable_mkldnn()
-        config.set_mkldnn_cache_capacity(mkldnn_cache_capacity)
-        if enable_int8:
-            config.enable_quantizer()
-            config.quantizer_config().set_quant_data(warmup_data)
-            config.quantizer_config().set_quant_batch_size(warmup_batch_size)
+        if use_analysis:
+            config.disable_gpu()
+            config.switch_use_feed_fetch_ops(True)
+            config.switch_ir_optim(True)
+            config.enable_mkldnn()
+            config.set_mkldnn_cache_capacity(mkldnn_cache_capacity)
+            if enable_ptq:
+                # For this pass to work properly, it must be added before fc_fuse_pass
+                config.pass_builder().insert_pass(5, "fc_lstm_fuse_pass")
+                config.enable_quantizer()
+                config.quantizer_config().set_quant_data(warmup_data)
+                config.quantizer_config().set_quant_batch_size(1)
         return config
 
     def run_program(self,
@@ -119,15 +118,13 @@ class TestLstmModelPTQ(unittest.TestCase):
                     num_threads,
                     mkldnn_cache_capacity,
                     warmup_iter,
-                    warmup_batch_size,
-                    enable_ptq_int8=False):
+                    use_analysis=False,
+                    enable_ptq=False):
         place = fluid.CPUPlace()
-        warmup_data, inputs = self.get_warmup_tensor(data_path, place,
-                                                     warmup_batch_size)
+        warmup_data, inputs = self.get_warmup_tensor(data_path, place)
         warmup_data = [item[0] for item in warmup_data]
         config = self.set_config(model_path, num_threads, mkldnn_cache_capacity,
-                                 warmup_batch_size, warmup_data,
-                                 enable_ptq_int8)
+                                 warmup_data, use_analysis, enable_ptq)
         predictor = create_paddle_predictor(config)
 
         data = [item[0] for item in inputs]
@@ -183,34 +180,47 @@ class TestLstmModelPTQ(unittest.TestCase):
         fp32_model = test_case_args.fp32_model
         assert fp32_model, 'The FP32 model path cannot be empty. Please, use the --fp32_model option.'
+        quant_model = test_case_args.quant_model
+        assert quant_model, 'The quant model path cannot be empty. Please, use the --quant_model option.'
         infer_data = test_case_args.infer_data
         assert infer_data, 'The dataset path cannot be empty. Please, use the --infer_data option.'
         num_threads = test_case_args.num_threads
         mkldnn_cache_capacity = test_case_args.mkldnn_cache_capacity
         warmup_iter = test_case_args.warmup_iter
-        warmup_batch_size = test_case_args.warmup_batch_size
         acc_diff_threshold = test_case_args.acc_diff_threshold
 
         (fp32_hx_acc, fp32_ctc_acc, fp32_fps) = self.run_program(
             fp32_model, infer_data, num_threads, mkldnn_cache_capacity,
-            warmup_iter, warmup_batch_size, False)
+            warmup_iter, False, False)
 
         (int8_hx_acc, int8_ctc_acc, int8_fps) = self.run_program(
             fp32_model, infer_data, num_threads, mkldnn_cache_capacity,
-            warmup_iter, warmup_batch_size, True)
+            warmup_iter, True, True)
+
+        quant_model_save_path = quant_model + "_int8"
+        # transform the Quant model into a Quant2 INT8 model
+        transform_and_save_int8_model(quant_model, quant_model_save_path,
+                                      "fusion_lstm,concat")
 
-        print("FP32: fps {0}, hx_acc {1}, ctc_acc {2}.".format(
+        (quant_hx_acc, quant_ctc_acc, quant_fps) = self.run_program(
+            quant_model_save_path, infer_data, num_threads,
+            mkldnn_cache_capacity, warmup_iter, True, False)
+
+        print("FP32: fps {0}, hx_acc {1}, ctc_acc {2}".format(
             fp32_fps, fp32_hx_acc, fp32_ctc_acc))
 
-        print("PTQ INT8: fps {0}, hx_acc {1}, ctc_acc {2}.".format(
+        print("PTQ_INT8: fps {0}, hx_acc {1}, ctc_acc {2}".format(
             int8_fps, int8_hx_acc, int8_ctc_acc))
 
+        print("QUANT2_INT8: fps {0}, hx_acc {1}, ctc_acc {2}".format(
+            quant_fps, quant_hx_acc, quant_ctc_acc))
+
         sys.stdout.flush()
 
-        hx_delta_value = fp32_hx_acc - int8_hx_acc
-        ctc_delta_value = fp32_ctc_acc - int8_ctc_acc
-        self.assertLess(hx_delta_value, acc_diff_threshold)
-        self.assertLess(ctc_delta_value, acc_diff_threshold)
+        self.assertLess(fp32_hx_acc - int8_hx_acc, acc_diff_threshold)
+        self.assertLess(fp32_ctc_acc - int8_ctc_acc, acc_diff_threshold)
+        self.assertLess(fp32_hx_acc - quant_hx_acc, acc_diff_threshold)
+        self.assertLess(fp32_ctc_acc - quant_ctc_acc, acc_diff_threshold)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
index e38148250af2177801995d263dc6d3c9502bc501..3fadf25150f9ef3556a343fdce8acc24d788f5dc 100644
--- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
@@ -16,11 +16,6 @@ import unittest
 import os
 import sys
 import argparse
-import logging
-import struct
-import six
-import numpy as np
-import time
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.framework import IrGraph
@@ -62,7 +57,11 @@ def parse_args():
     return test_args, sys.argv[:1] + args
 
 
-def transform_and_save_int8_model(original_path, save_path):
+def transform_and_save_int8_model(original_path,
+                                  save_path,
+                                  ops_to_quantize='',
+                                  op_ids_to_skip='',
+                                  debug=False):
     place = fluid.CPUPlace()
     exe = fluid.Executor(place)
     inference_scope = fluid.executor.global_scope()
@@ -75,24 +74,26 @@ def transform_and_save_int8_model(original_path,
      fetch_targets] = fluid.io.load_inference_model(original_path, exe, 'model',
                                                     'params')
 
-    ops_to_quantize = set()
-    if len(test_args.ops_to_quantize) > 0:
-        ops_to_quantize = set(test_args.ops_to_quantize.split(','))
+    ops_to_quantize_set = set()
+    print(ops_to_quantize)
+    if len(ops_to_quantize) > 0:
+        ops_to_quantize_set = set(ops_to_quantize.split(','))
 
-    op_ids_to_skip = set([-1])
-    if len(test_args.op_ids_to_skip) > 0:
-        op_ids_to_skip = set(map(int, test_args.op_ids_to_skip.split(',')))
+    op_ids_to_skip_set = set([-1])
+    print(op_ids_to_skip)
+    if len(op_ids_to_skip) > 0:
+        op_ids_to_skip_set = set(map(int, op_ids_to_skip.split(',')))
 
     graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
-    if (test_args.debug):
+    if (debug):
         graph.draw('.', 'quant_orig', graph.all_op_nodes())
     transform_to_mkldnn_int8_pass = Quant2Int8MkldnnPass(
-        ops_to_quantize,
-        _op_ids_to_skip=op_ids_to_skip,
+        ops_to_quantize_set,
+        _op_ids_to_skip=op_ids_to_skip_set,
         _scope=inference_scope,
         _place=place,
         _core=core,
-        _debug=test_args.debug)
+        _debug=debug)
     graph = transform_to_mkldnn_int8_pass.apply(graph)
     inference_program = graph.to_program()
     with fluid.scope_guard(inference_scope):
@@ -106,5 +107,6 @@ def transform_and_save_int8_model(original_path,
 if __name__ == '__main__':
     global test_args
     test_args, remaining_args = parse_args()
-    transform_and_save_int8_model(test_args.quant_model_path,
-                                  test_args.int8_model_save_path)
+    transform_and_save_int8_model(
+        test_args.quant_model_path, test_args.int8_model_save_path,
+        test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug)
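After this change, transform_and_save_int8_model can be imported and called directly rather than only through the script's command line. A minimal sketch of such a call, mirroring the way quant2_int8_lstm_model.py above uses it; the model paths are placeholders, not files shipped with the test data:

# Usage sketch (not part of the patch): drive the refactored helper from Python.
from save_quant_model import transform_and_save_int8_model

quant_model_path = "/path/to/lstm_quant"       # a trained Quant model (placeholder path)
int8_save_path = quant_model_path + "_int8"    # where the Quant2 INT8 model is written

# ops_to_quantize takes a comma-separated list of operator types;
# op_ids_to_skip ('') and debug (False) keep their defaults here.
transform_and_save_int8_model(quant_model_path, int8_save_path,
                              ops_to_quantize="fusion_lstm,concat")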
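For reference, a condensed sketch of the predictor setup exercised by the Quant2 INT8 path of the test (use_analysis=True, enable_ptq=False). It only repeats AnalysisConfig calls that appear in the patch; the model directory and the thread/cache values are illustrative:

from paddle.fluid.core import AnalysisConfig, create_paddle_predictor

# Output directory of transform_and_save_int8_model (placeholder path).
model_path = "/path/to/lstm_quant_int8"

config = AnalysisConfig(model_path)
config.set_cpu_math_library_num_threads(1)
config.disable_gpu()
config.switch_use_feed_fetch_ops(True)
config.switch_ir_optim(True)
config.enable_mkldnn()
config.set_mkldnn_cache_capacity(100)

predictor = create_paddle_predictor(config)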