diff --git a/paddleslim/quant/__init__.py b/paddleslim/quant/__init__.py
index 31199ef189037f43f80d3dcd9e9e75f8fba3bbd8..5eaac4e11efcfbae2d9ca02f05d86b93d469093c 100644
--- a/paddleslim/quant/__init__.py
+++ b/paddleslim/quant/__init__.py
@@ -25,6 +25,6 @@ try:
 except Exception as e:
     _logger.warning(
         "If you want to use training-aware and post-training quantization, "
-        "please use Paddle >= 1.7.0 or develop version")
+        "please use Paddle >= 2.0.0 or develop version")

 from .quant_embedding import quant_embedding
diff --git a/paddleslim/quant/quanter.py b/paddleslim/quant/quanter.py
index f3afbd636e5813bf2f15b8f396d055da5ea338ae..a2497d76d2903920508cfdc7d114019798e83957 100755
--- a/paddleslim/quant/quanter.py
+++ b/paddleslim/quant/quanter.py
@@ -24,6 +24,8 @@ from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
 from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
 from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization
 from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass
+from paddle.fluid.contrib.slim.quantization import OutScaleForTrainingPass
+from paddle.fluid.contrib.slim.quantization import OutScaleForInferencePass
 from paddle.fluid import core
 from paddle.fluid.contrib.slim.quantization import WeightQuantization
@@ -46,8 +48,8 @@ ACTIVATION_QUANTIZATION_TYPES_TENSORRT = [
 VALID_DTYPES = ['int8']
 TRANSFORM_PASS_OP_TYPES = QuantizationTransformPass._supported_quantizable_op_type
-QUANT_DEQUANT_PASS_OP_TYPES = AddQuantDequantPass._supported_quantizable_op_type + \
-        AddQuantDequantPass._activation_type
+QUANT_DEQUANT_PASS_OP_TYPES = AddQuantDequantPass._supported_quantizable_op_type
+
 TENSORRT_OP_TYPES = [
     'mul', 'conv2d', 'pool2d', 'depthwise_conv2d', 'elementwise_add',
     'leaky_relu'
 ]
@@ -220,6 +222,10 @@ def quant_aware(program, place, config=None, scope=None, for_test=False):
             quantizable_op_type=quant_dequant_ops)
         quant_dequant_pass.apply(main_graph)

+    out_scale_training_pass = OutScaleForTrainingPass(
+        scope=scope, place=place, moving_rate=config['moving_rate'])
+    out_scale_training_pass.apply(main_graph)
+
     if for_test:
         quant_program = main_graph.to_program()
     else:
@@ -230,9 +236,12 @@ def quant_post(executor,
                model_dir,
                quantize_model_path,
-               sample_generator,
+               batch_generator=None,
+               sample_generator=None,
                model_filename=None,
                params_filename=None,
+               save_model_filename='__model__',
+               save_params_filename='__params__',
                batch_size=16,
                batch_nums=None,
                scope=None,
@@ -241,6 +250,8 @@ def quant_post(executor,
                is_full_quantize=False,
                weight_bits=8,
                activation_bits=8,
+               activation_quantize_type='range_abs_max',
+               weight_quantize_type='channel_wise_abs_max',
                is_use_cache_file=False,
                cache_dir="./temp_post_training"):
     """
@@ -257,6 +268,10 @@ def quant_post(executor,
             are under the path.
         quantize_model_path(str): The path to save quantized model using api
             ``fluid.io.save_inference_model``.
+        batch_generator(Python Generator): The batch generator provides
+            calibrate data for DataLoader, and it returns a whole batch every
+            time. Only one of sample_generator and batch_generator can be set.
+            Besides, batch_generator supports lod tensor.
         sample_generator(Python Generator): The sample generator provides
             calibrate data for DataLoader, and it only returns a sample every time.
         model_filename(str, optional): The name of model file. If parameters
@@ -265,6 +280,9 @@ def quant_post(executor,
             When all parameters are saved in a single file, set it
             as filename. If parameters are saved in separate files,
             set it as 'None'. Default : 'None'.
+        save_model_filename(str): The name of model file to save the quantized inference program. Default: '__model__'.
+        save_params_filename(str): The name of file to save all related parameters.
+            If it is set to None, parameters will be saved in separate files. Default: '__params__'.
         batch_size(int, optional): The batch size of DataLoader, default is 16.
         batch_nums(int, optional): If batch_nums is not None, the number of calibrate
             data is 'batch_size*batch_nums'. If batch_nums is None, use all data
@@ -279,6 +297,15 @@ def quant_post(executor,
             "mul"].
         weight_bits(int, optional): quantization bit number for weights.
         activation_bits(int): quantization bit number for activation.
+        activation_quantize_type(str): quantization type for activation,
+            now supports 'range_abs_max', 'moving_average_abs_max' and 'abs_max'.
+            This parameter only specifies the fake ops in the quantized model.
+            If it is 'range_abs_max' or 'moving_average_abs_max', the scale
+            obtained by post training quantization is saved in the fake ops;
+            if it is 'abs_max', the scale will not be saved in the fake ops.
+        weight_quantize_type(str): quantization type for weights,
+            supports 'abs_max' and 'channel_wise_abs_max'. Compared to 'abs_max',
+            the model accuracy is usually higher when using 'channel_wise_abs_max'.
         is_full_quantize(bool): if True, apply quantization to all supported quantizable op type.
             If False, only apply quantization to the input quantizable_op_type. Default is False.
         is_use_cache_file(bool): If False, all temp data will be saved in memory. If True,
@@ -291,6 +318,7 @@ def quant_post(executor,
     post_training_quantization = PostTrainingQuantization(
         executor=executor,
         sample_generator=sample_generator,
+        batch_generator=batch_generator,
         model_dir=model_dir,
         model_filename=model_filename,
         params_filename=params_filename,
@@ -302,10 +330,15 @@ def quant_post(executor,
         is_full_quantize=is_full_quantize,
         weight_bits=weight_bits,
         activation_bits=activation_bits,
+        activation_quantize_type=activation_quantize_type,
+        weight_quantize_type=weight_quantize_type,
         is_use_cache_file=is_use_cache_file,
         cache_dir=cache_dir)
     post_training_quantization.quantize()
-    post_training_quantization.save_quantized_model(quantize_model_path)
+    post_training_quantization.save_quantized_model(
+        quantize_model_path,
+        model_filename=save_model_filename,
+        params_filename=save_params_filename)


 def convert(program, place, config=None, scope=None, save_int8=False):
@@ -336,12 +369,10 @@ def convert(program, place, config=None, scope=None, save_int8=False):
         assert isinstance(config, dict), "config must be dict"
         config = _parse_configs(config)
     _logger.info("convert config {}".format(config))
-    test_graph = IrGraph(core.Graph(program.desc), for_test=True)
-    support_op_types = []
-    for op in config['quantize_op_types']:
-        if op in QuantizationFreezePass._supported_quantizable_op_type:
-            support_op_types.append(op)
+    test_graph = IrGraph(core.Graph(program.desc), for_test=True)
+
+    out_scale_infer_pass = OutScaleForInferencePass(scope=scope)
+    out_scale_infer_pass.apply(test_graph)

     # Freeze the graph after training by adjusting the quantize
     # operators' order for the inference.
@@ -350,16 +381,13 @@ def convert(program, place, config=None, scope=None, save_int8=False):
         place=place,
         weight_bits=config['weight_bits'],
         activation_bits=config['activation_bits'],
-        weight_quantize_type=config['weight_quantize_type'],
-        quantizable_op_type=support_op_types)
+        weight_quantize_type=config['weight_quantize_type'])
+
     freeze_pass.apply(test_graph)
     freezed_program = test_graph.to_program()

     if save_int8:
-        convert_int8_pass = ConvertToInt8Pass(
-            scope=fluid.global_scope(),
-            place=place,
-            quantizable_op_type=support_op_types)
+        convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
         convert_int8_pass.apply(test_graph)
         freezed_program_int8 = test_graph.to_program()
         return freezed_program, freezed_program_int8
diff --git a/tests/test_quant_aware.py b/tests/test_quant_aware.py
index 208944b0fb04018caaf13ec58c327f849d2986d5..16c93fe15eda513d2c98a99243208c89aee3b3b4 100644
--- a/tests/test_quant_aware.py
+++ b/tests/test_quant_aware.py
@@ -132,7 +132,7 @@ class TestQuantAwareCase2(unittest.TestCase):
         def test(program):
             iter = 0
             result = [[], [], []]
-            for data in train_reader():
+            for data in eval_reader():
                 cost, top1, top5 = exe.run(
                     program,
                     feed=feeder.feed(data),
@@ -161,7 +161,8 @@ class TestQuantAwareCase2(unittest.TestCase):
             main_prog, place, config, for_test=False)
         quant_eval_prog = quant_aware(val_prog, place, config, for_test=True)
         train(quant_train_prog)
-        quant_eval_prog = convert(quant_eval_prog, place, config)
+        quant_eval_prog, int8_prog = convert(
+            quant_eval_prog, place, config, save_int8=True)
         top1_2, top5_2 = test(quant_eval_prog)
         # values before quantization and after quantization should be close
         print("before quantization: top1: {}, top5: {}".format(top1_1, top5_1))
diff --git a/tests/test_quant_post.py b/tests/test_quant_post.py
index 8c916b7b3a25a0409abb8bb26576581f596afd33..17068410109f4651f43382a603b1829eb5769413 100644
--- a/tests/test_quant_post.py
+++ b/tests/test_quant_post.py
@@ -101,12 +101,15 @@ class TestQuantAwareCase1(unittest.TestCase):
             exe,
             './test_quant_post',
             './test_quant_post_inference',
-            paddle.dataset.mnist.test(),
+            sample_generator=paddle.dataset.mnist.test(),
             model_filename='model',
             params_filename='params',
             batch_nums=10)
         quant_post_prog, feed_target_names, fetch_targets = fluid.io.load_inference_model(
-            dirname='./test_quant_post_inference', executor=exe)
+            dirname='./test_quant_post_inference',
+            executor=exe,
+            model_filename='__model__',
+            params_filename='__params__')
         top1_2, top5_2 = test(quant_post_prog, fetch_targets)
         print("before quantization: top1: {}, top5: {}".format(top1_1, top5_1))
         print("after quantization: top1: {}, top5: {}".format(top1_2, top5_2))
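Example usage (a minimal sketch of how the reworked quant_post interface can be driven, mirroring the updated call in tests/test_quant_post.py; the directories './quant_model' and './quant_model_int8' and the MNIST reader are placeholder assumptions, and only one of sample_generator and batch_generator may be passed):

    import paddle
    import paddle.fluid as fluid
    from paddleslim.quant import quant_post

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    # Post-training quantization of a float32 inference model saved with
    # fluid.io.save_inference_model (model file 'model', params file 'params').
    quant_post(
        executor=exe,
        model_dir='./quant_model',                 # assumed input model directory
        quantize_model_path='./quant_model_int8',  # assumed output directory
        sample_generator=paddle.dataset.mnist.test(),
        model_filename='model',
        params_filename='params',
        save_model_filename='__model__',
        save_params_filename='__params__',
        activation_quantize_type='range_abs_max',
        weight_quantize_type='channel_wise_abs_max',
        batch_size=16,
        batch_nums=10)

    # Load the quantized program back using the saved file names.
    quant_prog, feed_names, fetch_targets = fluid.io.load_inference_model(
        dirname='./quant_model_int8',
        executor=exe,
        model_filename='__model__',
        params_filename='__params__')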