import paddle
from paddle.fluid.framework import IrGraph
from paddle.fluid import core
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass, AddQuantDequantPass, QuantizationFreezePass

try:
    from paddle.fluid.contrib.slim.quantization import utils
    TRANSFORM_PASS_OP_TYPES = utils._weight_supported_quantizable_op_type
    QUANT_DEQUANT_PASS_OP_TYPES = utils._act_supported_quantizable_op_type
except ImportError:
    TRANSFORM_PASS_OP_TYPES = QuantizationTransformPass._supported_quantizable_op_type
    QUANT_DEQUANT_PASS_OP_TYPES = AddQuantDequantPass._supported_quantizable_op_type


def post_quant_fake(executor,
                    model_dir,
                    model_filename=None,
                    params_filename=None,
                    save_model_path=None,
                    quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
                    is_full_quantize=False,
                    activation_bits=8,
                    weight_bits=8):
    """
    Utilize the post-training quantization method to quantize the FP32 model.
    It does not use calibration data, so the resulting fake-quantized model
    cannot be used in practice.

    Usage:
        paddle.enable_static()
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        post_quant_fake(
            executor=exe,
            model_dir='./inference_model/MobileNet/',
            model_filename='model',
            params_filename='params',
            save_model_path='fake_quant')
    """
    activation_quantize_type = 'range_abs_max'
    weight_quantize_type = 'channel_wise_abs_max'
    _dynamic_quantize_op_type = ['lstm']
    _weight_supported_quantizable_op_type = TRANSFORM_PASS_OP_TYPES
    _act_supported_quantizable_op_type = QUANT_DEQUANT_PASS_OP_TYPES
    _support_quantize_op_type = list(
        set(_weight_supported_quantizable_op_type +
            _act_supported_quantizable_op_type + _dynamic_quantize_op_type))
    _place = executor.place
    _scope = paddle.static.global_scope()

    if is_full_quantize:
        _quantizable_op_type = _support_quantize_op_type
    else:
        _quantizable_op_type = quantizable_op_type
        for op_type in _quantizable_op_type:
            assert op_type in _support_quantize_op_type, \
                op_type + " is not supported for quantization."

    _program, _feed_list, _fetch_list = paddle.fluid.io.load_inference_model(
        model_dir,
        executor,
        model_filename=model_filename,
        params_filename=params_filename)

    graph = IrGraph(core.Graph(_program.desc), for_test=True)

    # use QuantizationTransformPass to insert fake_quant/fake_dequantize ops
    major_quantizable_op_types = []
    for op_type in _weight_supported_quantizable_op_type:
        if op_type in _quantizable_op_type:
            major_quantizable_op_types.append(op_type)
    transform_pass = QuantizationTransformPass(
        scope=_scope,
        place=_place,
        weight_bits=weight_bits,
        activation_bits=activation_bits,
        activation_quantize_type=activation_quantize_type,
        weight_quantize_type=weight_quantize_type,
        quantizable_op_type=major_quantizable_op_types)

    for sub_graph in graph.all_sub_graphs():
        # Inserting fake_quant/fake_dequantize ops must happen in a test
        # graph, so set each sub-graph's _for_test flag to True.
        sub_graph._for_test = True
        transform_pass.apply(sub_graph)

    # use AddQuantDequantPass to insert fake_quant_dequant ops
    minor_quantizable_op_types = []
    for op_type in _act_supported_quantizable_op_type:
        if op_type in _quantizable_op_type:
            minor_quantizable_op_types.append(op_type)
    add_quant_dequant_pass = AddQuantDequantPass(
        scope=_scope,
        place=_place,
        quantizable_op_type=minor_quantizable_op_types)

    for sub_graph in graph.all_sub_graphs():
        sub_graph._for_test = True
        add_quant_dequant_pass.apply(sub_graph)

    # apply QuantizationFreezePass and obtain the final quant model
    freeze_pass = QuantizationFreezePass(
        scope=_scope,
        place=_place,
        weight_bits=weight_bits,
        activation_bits=activation_bits,
        weight_quantize_type=weight_quantize_type,
        quantizable_op_type=major_quantizable_op_types)

    for sub_graph in graph.all_sub_graphs():
        sub_graph._for_test = True
        freeze_pass.apply(sub_graph)

    _program = graph.to_program()

    paddle.fluid.io.save_inference_model(
        dirname=save_model_path,
        model_filename=model_filename,
        params_filename=params_filename,
        feeded_var_names=_feed_list,
        target_vars=_fetch_list,
        executor=executor,
        main_program=_program)
    print("The quantized model is saved in: " + save_model_path)
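

# A minimal usage sketch mirroring the docstring example above; the model
# directory and filenames are placeholders and must point to an existing
# FP32 static-graph inference model.
if __name__ == "__main__":
    paddle.enable_static()
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    post_quant_fake(
        executor=exe,
        model_dir='./inference_model/MobileNet/',  # placeholder path
        model_filename='model',
        params_filename='params',
        save_model_path='fake_quant')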