diff --git a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py
index cde3d991a7f2fda5b2ab5ed57ac5bc0f5d06b143..753d68f79703271b3719cfe6f920d3008c003c64 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py
@@ -17,6 +17,7 @@ import logging
 import numpy as np
 from .... import core
 from ....framework import Program, Operator, Variable, program_guard
+from ....executor import global_scope
 from .... import unique_name
 from ....layer_helper import LayerHelper
 from ....param_attr import ParamAttr
@@ -27,26 +28,49 @@ _logger = get_logger(
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
+def find_next_ops(block, var_name):
+    """
+    Find all followed ops for the input variable.
+    """
+    res_ops = []
+    for op in block.ops:
+        if var_name in op.input_arg_names:
+            res_ops.append(op)
+    return res_ops
+
+
+def load_variable_data(scope, var_name):
+    '''
+    Load variable value from scope
+    '''
+    var_node = scope.find_var(var_name)
+    assert var_node is not None, \
+        "Cannot find " + var_name + " in scope."
+    return np.array(var_node.get_tensor())
+
+
 class QuantizeTranspilerV2(object):
     def __init__(self,
                  weight_bits=8,
                  activation_bits=8,
                  weight_quantize_type='abs_max',
-                 activation_quantize_type='abs_max',
-                 quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'],
+                 activation_quantize_type='moving_average_abs_max',
+                 quantizable_op_type=[
+                     'conv2d',
+                     'depthwise_conv2d',
+                     'mul',
+                 ],
                  skip_pattern=['skip_quant']):
         """
-        Add quant_dequant op before the quantized op to quantize the fluid Program.
-        It is a patch for distributed quantization, we will support others module for
-        distributed quantization.
+        Apply fake quant for the quantized ops.
 
         Args:
             weight_bits(int): the bit of quantized weight.
             activation_bits(int): the bit of quantized activation.
             weight_quantize_type(str): the quantization type for weight.
-                Only support to be 'abs_max' for now.
+                Only support to be 'abs_max' and 'channel_wise_abs_max'.
             activation_quantize_type(str): the quantization type for activation.
-                Only support to be 'abs_max' for now.
+                Only support to be 'abs_max' and 'moving_average_abs_max'.
             quantizable_op_type(str): set the op type for quantization.
             skip_pattern(str|list): The user-defined quantization skip pattern, which
                 will be presented in the name scope of an op. When the skip pattern is
@@ -55,28 +79,37 @@ class QuantizeTranspilerV2(object):
         self._weight_bits = weight_bits
         self._activation_bits = activation_bits
 
-        assert activation_quantize_type == "abs_max", \
-            "activation_quantize_type should be abs_max for now."
-        assert weight_quantize_type == "abs_max", \
-            "weight_quantize_type should be abs_max for now."
+        assert activation_quantize_type in \
+            ["abs_max", "moving_average_abs_max"], \
+            "activation_quantize_type should be abs_max " \
+            "or moving_average_abs_max for now."
+        assert weight_quantize_type in ["abs_max", "channel_wise_abs_max"], \
+            "weight_quantize_type should be abs_max or channel_wise_abs_max."
         self._activation_quantize_type = activation_quantize_type
         self._weight_quantize_type = weight_quantize_type
 
+        for op_type in quantizable_op_type:
+            assert op_type in ['conv2d', 'depthwise_conv2d', 'mul'], \
+                "Quantize op should be ['conv2d', 'depthwise_conv2d', 'mul']"
         self._quantizable_ops = quantizable_op_type
         self._quantizable_grad_ops = [
             '%s_grad' % (op) for op in self._quantizable_ops
         ]
         self._skip_pattern = skip_pattern
-        self.helper = LayerHelper(self.__class__.__name__)
+        self._helper = LayerHelper(self.__class__.__name__)
 
-    def apply(self, program, startup_program):
+        self._moving_rate = 0.9
+        self._out_ch_axis1_ops = ['conv2d_transpose', 'mul', 'matmul']
+
+    def apply(self, program, startup_program, is_test=False):
         """
         Apply quantization to fluid Program.
 
         Args:
             program(Program): the train or test program to be quantized.
             startup_program(Program): the corresponding startup_program.
+            is_test(bool): Whether the program is used for testing.
 
         Returns:
             None
         """
@@ -85,7 +118,7 @@ class QuantizeTranspilerV2(object):
         assert isinstance(startup_program, Program), \
             "startup_program must be the instance of Program"
 
-        quant_dequant_vars = [
+        var_rename_map = [
            collections.OrderedDict() for _ in range(len(program.blocks))
         ]
         with program_guard(program, startup_program):
@@ -94,13 +127,104 @@ class QuantizeTranspilerV2(object):
                 for op in ops:
                     if op.type in self._quantizable_ops and \
                         (not self._is_skip_quant(op)):
-                        self._transform_forward(block, op, quant_dequant_vars)
+                        self._transform_forward(block, op, var_rename_map,
+                                                is_test)
+
             for block in program.blocks:
                 ops = list(block.ops)
                 for op in ops:
                     if op.type in self._quantizable_grad_ops and \
                         (not self._is_skip_quant(op)):
-                        self._transform_backward(block, op, quant_dequant_vars)
+                        self._transform_backward(block, op, var_rename_map)
+
+    def convert(self, test_program, scope=None):
+        """
+        Convert the test program.
+        Get the out scale from the moving_average_abs_max_scale op and save it
+        into the quantized op.
+
+        Args:
+            test_program(Program): the test program to be converted.
+            scope(fluid.Scope, optional): The scope of the program, used to load
+                and save variables. If scope=None, get the scope by global_scope().
+        """
+        scope = global_scope() if scope == None else scope
+
+        for block in test_program.blocks:
+            for op in block.ops:
+                if op.has_attr("quantization_type") \
+                    and op.attr("quantization_type") == "qat_with_weight":
+                    # quant op -> var1 -> fake op -> var2
+                    assert len(op.output_arg_names) == 1
+                    var1_name = op.output_arg_names[0]
+
+                    fake_ops = find_next_ops(block, var1_name)
+                    assert len(fake_ops) == 1
+                    fake_op = fake_ops[0]
+                    assert fake_op.type == "moving_average_abs_max_scale"
+
+                    out_scale_name = fake_op.output("OutScale")
+                    out_threshold = load_variable_data(scope, out_scale_name[0])
+                    op._set_attr("out_threshold", float(out_threshold))
+
+                    var2_name = fake_op.output("Out")[0]
+                    op._rename_output(var1_name, var2_name)
+                    fake_op._rename_output(var2_name, var1_name)
+
+    def _transform_forward(self, block, op, var_rename_map, is_test):
+        """
+        Insert fake quant op before the target ops.
+ """ + op._set_attr("quantization_type", "qat_with_weight") + + # insert fake quant op before the quantized op + for in_name in op.input_arg_names: + block_id = block.idx + idx = block.ops.index(op) + + if in_name in var_rename_map[block_id]: + new_in_name = var_rename_map[block_id][in_name] + else: + in_var = block.var(in_name) + if in_var.dtype != core.VarDesc.VarType.FP32: + continue + + quant_bits = self._weight_bits if in_var.persistable \ + else self._activation_bits + quant_type = self._weight_quantize_type if in_var.persistable \ + else self._activation_quantize_type + + if quant_type == "abs_max": + new_var = self._insert_abs_max_fq_op(block, idx, in_var, + quant_bits) + elif quant_type == "moving_average_abs_max": + new_var = self._insert_ma_abs_max_fq_op(block, idx, in_var, + quant_bits, is_test) + elif quant_type == "channel_wise_abs_max": + ch_axis = 1 if op.type in self._out_ch_axis1_ops else 0 + new_var = self._insert_pc_abs_max_fq_op(block, idx, in_var, + quant_bits, ch_axis) + else: + _logger.error("Don't support the quant_type: %s" % + quant_type) + continue + + new_in_name = new_var.name + var_rename_map[block_id][in_name] = new_in_name + + op._rename_input(in_name, new_in_name) + + # insert out scale op followed the quantized op + for out_name in op.output_arg_names: + next_ops = find_next_ops(block, out_name) + + idx = block.ops.index(op) + out_var = block.var(out_name) + new_out_var = self._insert_ma_abs_max_scale_op( + block, idx + 1, out_var, is_test, True) + + for next_op in next_ops: + if "_grad" not in next_op.type: + next_op._rename_input(out_name, new_out_var.name) def _is_skip_quant(self, op): """ @@ -117,49 +241,35 @@ class QuantizeTranspilerV2(object): self._skip_pattern) != -1 return user_skipped - def _transform_forward(self, block, op, quant_dequant_vars): - op._set_attr("quantization_type", "qat_with_weight") - idx = block.ops.index(op) - block_id = block.idx - for in_name in op.input_arg_names: - if in_name in quant_dequant_vars[block_id]: - quant_dequant_var = quant_dequant_vars[block_id][in_name] - else: - in_var = block.var(in_name) - quant_bits = self._weight_bits if in_var.persistable \ - else self._activation_bits - quant_type = self._weight_quantize_type if in_var.persistable \ - else self._activation_quantize_type - if quant_type == "abs_max": - quant_dequant_var = self._insert_quant_dequant_abs_max_op( - block, idx, in_var, quant_bits) - else: - _logger.error("Quant_type only supported to be abs_max") - quant_dequant_vars[block_id][in_name] = quant_dequant_var - op._rename_input(in_name, quant_dequant_var.name) - - def _transform_backward(self, block, op, quant_dequant_vars): + def _transform_backward(self, block, op, var_rename_map): + """ + Update the backword of the target ops. + Note: for the grad ops, only rename the input, skip rename the output. + """ block_id = block.idx no_dequanted_input_vars = True for name in op.input_arg_names: - if name in quant_dequant_vars[block_id]: - dequant_var = quant_dequant_vars[block_id][name] - op._rename_input(name, dequant_var.name) + if name in var_rename_map[block_id]: + new_var_name = var_rename_map[block_id][name] + op._rename_input(name, new_var_name) no_dequanted_input_vars = False if no_dequanted_input_vars: raise ValueError("There is no dequanted inputs for op %s." % (op.type)) - def _insert_quant_dequant_abs_max_op(self, block, idx, in_var, quant_bits): + def _insert_abs_max_fq_op(self, block, idx, in_var, quant_bits): + """ + Inset abs max fake quant op. 
+ """ quant_dequant_var = block.create_var( type=in_var.type, name="{}.quant_dequant".format(in_var.name), shape=in_var.shape, dtype=in_var.dtype) - scale_var = self.helper.create_parameter( + scale_var = self._helper.create_parameter( attr=ParamAttr( name="{}.quant_dequant.scale".format(in_var.name), - initializer=Constant(0.001), + initializer=Constant(0.), trainable=False), shape=[1], dtype=in_var.dtype) @@ -175,3 +285,157 @@ class QuantizeTranspilerV2(object): inputs=inputs, outputs=outputs) return quant_dequant_var + + def _insert_ma_abs_max_fq_op(self, block, idx, in_var, quant_bits, is_test): + """ + Insert moving average abs max fake quant op. + """ + quant_dequant_var = block.create_var( + type=in_var.type, + name="{}.quant_dequant".format(in_var.name), + shape=in_var.shape, + dtype=in_var.dtype) + + scale_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.quant_dequant.scale".format(in_var.name), + initializer=Constant(0.), + trainable=False), + shape=[1], + dtype=in_var.dtype) + scale_var.stop_gradient = True + + if not is_test: + state_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.quant_dequant.state".format(in_var.name), + initializer=Constant(0), + trainable=False), + shape=[1], + dtype=in_var.dtype) + state_var.stop_gradient = True + + accum_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.quant_dequant.accum".format(in_var.name), + initializer=Constant(0), + trainable=False), + shape=[1], + dtype=in_var.dtype) + accum_var.stop_gradient = True + + attrs = { + 'moving_rate': self._moving_rate, + 'bit_length': quant_bits, + 'is_test': is_test + } + inputs = {'X': in_var, 'InScale': scale_var} + outputs = {'Out': quant_dequant_var, 'OutScale': scale_var} + if not is_test: + inputs['InState'] = state_var + inputs['InAccum'] = accum_var + outputs['OutState'] = state_var + outputs['OutAccum'] = accum_var + + block._insert_op( + idx, + type='fake_quantize_dequantize_moving_average_abs_max', + attrs=attrs, + inputs=inputs, + outputs=outputs) + return quant_dequant_var + + def _insert_pc_abs_max_fq_op(self, block, idx, in_var, quant_bits, ch_axis): + """ + Insert per channel abs max fake quant op. + """ + quant_dequant_var = block.create_var( + type=in_var.type, + name="{}.quant_dequant".format(in_var.name), + shape=in_var.shape, + dtype=in_var.dtype) + + scale_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.quant_dequant.scale".format(in_var.name), + initializer=Constant(0.), + trainable=False), + shape=[in_var.shape[ch_axis]], + dtype=in_var.dtype) + scale_var.stop_gradient = True + + inputs = {'X': in_var} + outputs = {'Out': quant_dequant_var, 'OutScale': scale_var} + attrs = {'bit_length': quant_bits, 'quant_axis': ch_axis} + block._insert_op( + idx, + type='fake_channel_wise_quantize_dequantize_abs_max', + attrs=attrs, + inputs=inputs, + outputs=outputs) + return quant_dequant_var + + def _insert_ma_abs_max_scale_op(self, + block, + idx, + in_var, + is_test, + has_out_var=False): + """ + Insert moving average abs max scale op. 
+ """ + scale_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.outscale.scale".format(in_var.name), + initializer=Constant(0.), + trainable=False), + shape=[1], + dtype=in_var.dtype) + scale_var.stop_gradient = True + + attrs = {'moving_rate': self._moving_rate, 'is_test': is_test} + inputs = {'X': in_var} + outputs = {'OutScale': scale_var} + + if not is_test: + state_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.outscale.state".format(in_var.name), + initializer=Constant(0), + trainable=False), + shape=[1], + dtype=in_var.dtype) + state_var.stop_gradient = True + + accum_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.outscale.accum".format(in_var.name), + initializer=Constant(0), + trainable=False), + shape=[1], + dtype=in_var.dtype) + accum_var.stop_gradient = True + + inputs['InState'] = state_var + inputs['InAccum'] = accum_var + outputs['OutState'] = state_var + outputs['OutAccum'] = accum_var + + if has_out_var: + out_var = block.create_var( + type=in_var.type, + name="{}.tmp".format(in_var.name), + shape=in_var.shape, + dtype=in_var.dtype) + + outputs['Out'] = out_var + + block._insert_op( + idx, + type='moving_average_abs_max_scale', + attrs=attrs, + inputs=inputs, + outputs=outputs) + + if has_out_var: + return out_var diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py index 00f2b597d934ba9467c7f37fcbbac843a4223ac8..aa9f6a1801cbf642d9f420a36e6ac5df3f84d2b6 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py @@ -79,6 +79,7 @@ class TestQuantizeProgramPass(unittest.TestCase): random.seed(0) np.random.seed(0) + # 1 Define program train_program = fluid.Program() startup_program = fluid.Program() test_program = fluid.Program() @@ -93,15 +94,14 @@ class TestQuantizeProgramPass(unittest.TestCase): test_graph = IrGraph(core.Graph(test_program.desc), for_test=True) test_graph.draw('.', 'test_program_1') + # 2 Apply quantization qt = QuantizeTranspilerV2( activation_quantize_type=activation_quant_type, - weight_quantize_type=weight_quant_type, - quantizable_op_type=[ - 'conv2d', 'depthwise_conv2d', 'mul', 'pool2d' - ]) - qt.apply(train_program, startup_program) - qt.apply(test_program, startup_program) + weight_quantize_type=weight_quant_type) + qt.apply(train_program, startup_program, is_test=False) + qt.apply(test_program, startup_program, is_test=True) + # 3 Train place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) scope = fluid.Scope() @@ -120,28 +120,32 @@ class TestQuantizeProgramPass(unittest.TestCase): build_strategy.fuse_all_reduce_ops = False binary = fluid.CompiledProgram(train_program).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy) - iters = 2 + iters = 5 batch_size = 8 train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=batch_size) feeder = fluid.DataFeeder(feed_list=feeds, place=place) with fluid.scope_guard(scope): - for _ in range(iters): + for idx in range(iters): data = next(train_reader()) loss_v = exe.run(binary, feed=feeder.feed(data), fetch_list=[loss]) - if not for_ci: - print('{}: {}'.format('loss', loss_v)) + if not for_ci and idx % 20 == 0: + print('{}: {}'.format('loss', np.mean(loss_v))) + print('{}: {}'.format('loss', np.mean(loss_v))) + + # 4 Convert + qt.convert(test_program, scope) if not for_ci: with 
fluid.scope_guard(scope): fluid.io.save_inference_model('./infer_model', ['image', 'label'], [loss], exe, test_program) - def test_quantize_program_gpu(self): + def test_gpu_1(self): if fluid.core.is_compiled_with_cuda(): self.quantize_program( use_cuda=True, @@ -150,7 +154,16 @@ class TestQuantizeProgramPass(unittest.TestCase): weight_quant_type='abs_max', for_ci=True) - def test_quantize_program_cpu(self): + def test_gpu_2(self): + if fluid.core.is_compiled_with_cuda(): + self.quantize_program( + use_cuda=True, + seed=1, + activation_quant_type='moving_average_abs_max', + weight_quant_type='channel_wise_abs_max', + for_ci=True) + + def test_cpu_1(self): self.quantize_program( use_cuda=False, seed=2, @@ -158,6 +171,14 @@ class TestQuantizeProgramPass(unittest.TestCase): weight_quant_type='abs_max', for_ci=True) + def test_cpu_2(self): + self.quantize_program( + use_cuda=False, + seed=2, + activation_quant_type='moving_average_abs_max', + weight_quant_type='channel_wise_abs_max', + for_ci=True) + if __name__ == '__main__': unittest.main()
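
For reviewers who want to try the new API end to end, below is a minimal usage sketch (not part of the patch) that mirrors the updated unit test: build the programs, call apply, run a few iterations, then convert and save the inference model. It assumes the legacy paddle.fluid static-graph API of this branch, that QuantizeTranspilerV2 is importable from paddle.fluid.contrib.slim.quantization as in the test, and the toy network and variable names are illustrative only.

import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib.slim.quantization import QuantizeTranspilerV2

paddle.enable_static()


def build_net(main_prog, startup_prog):
    # A toy conv + fc classifier; any conv2d/depthwise_conv2d/mul based network
    # is handled the same way by the transpiler.
    with fluid.unique_name.guard():
        with fluid.program_guard(main_prog, startup_prog):
            image = fluid.data(name='image', shape=[None, 1, 28, 28], dtype='float32')
            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
            conv = fluid.layers.conv2d(image, num_filters=6, filter_size=3, act='relu')
            pred = fluid.layers.fc(conv, size=10, act='softmax')
            loss = fluid.layers.mean(fluid.layers.cross_entropy(pred, label))
    return loss


train_prog, startup_prog, test_prog = fluid.Program(), fluid.Program(), fluid.Program()
train_loss = build_net(train_prog, startup_prog)
test_loss = build_net(test_prog, fluid.Program())

# 1 Insert fake quant / out-scale ops into both programs.
qt = QuantizeTranspilerV2(
    activation_quantize_type='moving_average_abs_max',
    weight_quantize_type='channel_wise_abs_max')
qt.apply(train_prog, startup_prog, is_test=False)
qt.apply(test_prog, startup_prog, is_test=True)

# 2 Run a few forward passes so the moving-average scales are collected
#   (a real run would add an optimizer and train for many iterations).
place = fluid.CPUPlace()
exe = fluid.Executor(place)
scope = fluid.Scope()
with fluid.scope_guard(scope):
    exe.run(startup_prog)
    feed = {'image': np.random.rand(8, 1, 28, 28).astype('float32'),
            'label': np.random.randint(0, 10, (8, 1)).astype('int64')}
    exe.run(train_prog, feed=feed, fetch_list=[train_loss])

# 3 Fold the collected out scales into the quantized ops and save the model.
qt.convert(test_prog, scope)
with fluid.scope_guard(scope):
    fluid.io.save_inference_model('./infer_model', ['image', 'label'],
                                  [test_loss], exe, test_prog)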
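As background on what the inserted operators compute, here is a rough numpy illustration of per-tensor 'abs_max' fake quant-dequant and of the moving-average scale statistics kept in the State/Accum parameters above. The function names are made up for the example, and the 0.9 rate simply mirrors the self._moving_rate default; treat this as a sketch of the math rather than the operator implementations.

import numpy as np


def fake_quant_dequant_abs_max(x, bits=8):
    # Per-tensor symmetric quantization: the scale is max(|x|); values are
    # rounded onto the integer grid [-127, 127] (for 8 bits) and mapped back.
    bnt = (1 << (bits - 1)) - 1
    scale = np.max(np.abs(x))
    return np.round(x / scale * bnt) * scale / bnt


def moving_average_scale(x, state, accum, rate=0.9):
    # Running estimate of the activation range: accum keeps a decayed sum of
    # batch abs-max values, state keeps the decayed count, scale = accum / state.
    accum = rate * accum + np.max(np.abs(x))
    state = rate * state + 1.0
    return accum / state, state, accum


x = np.random.randn(4, 8).astype('float32')
print("max quant-dequant error:", np.max(np.abs(x - fake_quant_dequant_abs_max(x))))
scale, state, accum = moving_average_scale(x, state=0.0, accum=0.0)
print("initial moving-average scale:", scale)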