diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index d52e3ea10459d4e2488aba32bf06f88ee6eccfca..f262ace3dc6527be3358b74e7fc2736817e3a302 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -37,7 +37,10 @@ def _load_variable_data(scope, var_name):
     '''
     Load variable value from scope
     '''
-    return np.array(scope.find_var(var_name).get_tensor())
+    var_node = scope.find_var(var_name)
+    assert var_node is not None, \
+        "Cannot find " + var_name + " in scope."
+    return np.array(var_node.get_tensor())
 
 
 def _set_variable_data(scope, place, var_name, np_value):
@@ -53,6 +56,12 @@ def _set_variable_data(scope, place, var_name, np_value):
 
 
 class PostTrainingQuantization(object):
+    """
+    Utilize the post training quantization method to quantize the FP32 model.
+    It uses calibrate data to get the quantization information for all
+    quantized variables.
+    """
+
     def __init__(self,
                  executor=None,
                  scope=None,
@@ -70,13 +79,10 @@ class PostTrainingQuantization(object):
                  is_use_cache_file=False,
                  cache_dir="./temp_post_training"):
         '''
-        The class utilizes post training quantization methon to quantize the
-        fp32 model. It uses calibrate data to calculate the scale factor of
-        quantized variables, and inserts fake quant/dequant op to obtain the
-        quantized model.
+        Constructor.
 
         Args:
-            executor(fluid.Executor): The executor to load, run and save the 
+            executor(fluid.Executor): The executor to load, run and save the
                 quantized model.
             scope(fluid.Scope, optional): The scope of the program, use it to load
                 and save variables. If scope=None, get scope by global_scope().
@@ -96,9 +102,11 @@ class PostTrainingQuantization(object):
             batch_nums(int, optional): If batch_nums is not None, the number of
                 calibrate data is batch_size*batch_nums. If batch_nums is None, use
                 all data provided by sample_generator as calibrate data.
-            algo(str, optional): If algo=KL, use KL-divergenc method to
-                get the more precise scale factor. If algo='direct', use
-                abs_max methon to get the scale factor. Default is KL.
+            algo(str, optional): If algo='KL', use KL-divergence method to
+                get the KL threshold for quantized activations and get the abs_max
+                value for quantized weights. If algo='abs_max', get the abs max
+                value for activations and weights. If algo='min_max', get the min
+                and max value for quantized activations and weights. Default is 'KL'.
             quantizable_op_type(list[str], optional): List the type of ops
                 that will be quantized. Default is ["conv2d", "depthwise_conv2d",
                 "mul"].
@@ -158,7 +166,9 @@ class PostTrainingQuantization(object):
         assert executor is not None, "The executor cannot be None."
         assert model_dir is not None, "The model_dir cannot be None."
         assert sample_generator is not None, \
-            "The sample_generator cannot be None."
+            "The sample_generator cannot be None."
+        assert algo in ['KL', 'abs_max', 'min_max'], \
+            "The algo should be KL, abs_max or min_max."
 
         self._executor = executor
         self._scope = global_scope() if scope == None else scope
@@ -182,8 +192,7 @@ class PostTrainingQuantization(object):
         else:
             self._quantizable_op_type = quantizable_op_type
             for op_type in self._quantizable_op_type:
-                assert op_type in supported_quantizable_op_type + \
-                    AddQuantDequantPass._activation_type, \
+                assert op_type in supported_quantizable_op_type, \
                     op_type + " is not supported for quantization."
         self._place = self._executor.place
@@ -197,20 +206,25 @@ class PostTrainingQuantization(object):
         self._quantized_weight_var_name = set()
         self._quantized_act_var_name = set()
         self._sampling_data = {}
-        self._quantized_var_scale_factor = {}
+        self._quantized_var_kl_threshold = {}
+        self._quantized_var_min = {}
+        self._quantized_var_max = {}
+        self._quantized_var_abs_max = {}
 
     def quantize(self):
         '''
-        Quantize the fp32 model. Use calibrate data to calculate the scale factor of
-        quantized variables, and inserts fake quant/dequant op to obtain the
-        quantized model.
+        Load the FP32 model, and run the calibrate data through the forward stage.
+        Based on the sampled data, calculate the quantization information and obtain
+        the final quantized model.
 
         Args:
             None
         Returns:
             the program of quantized model.
         '''
-        self._preprocess()
+        self._load_model_data()
+        self._collect_quantized_varnames()
+        self._set_activation_persistable()
 
         batch_id = 0
         for data in self._data_loader():
@@ -218,22 +232,29 @@
                                feed=data,
                                fetch_list=self._fetch_list,
                                return_numpy=False)
-            self._sample_data(batch_id)
+            if self._algo == "KL":
+                self._sample_data(batch_id)
+            else:
+                self._sample_threshold()
 
             if batch_id % 5 == 0:
-                _logger.info("run batch: " + str(batch_id))
+                _logger.info("Run batch: " + str(batch_id))
             batch_id += 1
             if self._batch_nums and batch_id >= self._batch_nums:
                 break
-        _logger.info("all run batch: " + str(batch_id))
+        _logger.info("Finish all batch: " + str(batch_id))
 
-        _logger.info("calculate scale factor ...")
-        self._calculate_scale_factor()
+        self._reset_activation_persistable()
 
-        _logger.info("update the program ...")
-        self._update_program()
+        if self._algo == "KL":
+            self._calculate_kl_threshold()
 
-        self._save_output_scale()
+        if self._algo in ["KL", "abs_max"]:
+            self._update_program()
+        else:
+            self._save_input_threshold()
+
+        self._save_output_threshold()
 
         return self._program
 
     def save_quantized_model(self, save_model_path):
@@ -252,12 +273,11 @@
             executor=self._executor,
             main_program=self._program)
 
-    def _preprocess(self):
+    def _load_model_data(self):
         '''
-        Load model and set data loader, collect the variable names for sampling,
-        and set activation variables to be persistable.
+        Load model and set data loader.
         '''
-        # load model and set data loader
+        _logger.info("Load model and set data loader ...")
         [self._program, self._feed_list, self._fetch_list] = \
             io.load_inference_model(dirname=self._model_dir,
                                     executor=self._executor,
@@ -273,7 +293,12 @@
             drop_last=True,
             places=self._place)
 
-        # collect the variable names for sampling.
+    def _collect_quantized_varnames(self):
+        '''
+        Collect the variable names for sampling. Activation variables are
+        made persistable separately in _set_activation_persistable.
+        '''
+        _logger.info("Collect quantized variable names ...")
         # TODO(juncaipeng), consider the name_scope of skip_quant and
         # reduce the variables for sampling
         persistable_var_names = []
@@ -284,46 +309,109 @@
         for op in self._program.global_block().ops:
             op_type = op.type
             if op_type in self._quantizable_op_type:
-                if op_type in ("conv2d", "depthwise_conv2d"):
-                    self._quantized_act_var_name.add(op.input("Input")[0])
-                    self._quantized_weight_var_name.add(op.input("Filter")[0])
-                    self._quantized_act_var_name.add(op.output("Output")[0])
-                elif op_type in ["mul", "matmul"]:
-                    x_var_name = op.input("X")[0]
-                    if x_var_name in persistable_var_names:
-                        self._quantized_weight_var_name.add(x_var_name)
-                    else:
-                        self._quantized_act_var_name.add(x_var_name)
-                    y_var_name = op.input("Y")[0]
-                    if y_var_name in persistable_var_names:
-                        self._quantized_weight_var_name.add(y_var_name)
-                    else:
-                        self._quantized_act_var_name.add(y_var_name)
-                    self._quantized_act_var_name.add(op.output("Out")[0])
-                else:
-                    # process other quantizable op type, the input must all not persistable
-                    if self._is_input_all_not_persistable(
-                            op, persistable_var_names):
-                        input_output_name_list = self._op_real_in_out_name[
-                            op_type]
-                        for input_name in input_output_name_list[0]:
-                            for var_name in op.input(input_name):
-                                self._quantized_act_var_name.add(var_name)
-                        for output_name in input_output_name_list[1]:
-                            for var_name in op.output(output_name):
-                                self._quantized_act_var_name.add(var_name)
-
-        # set activation variables to be persistable, so can obtain
-        # the tensor data in sample_data
+                name_list = self._op_real_in_out_name[op_type]
+                for input_name in name_list[0]:
+                    for var_name in op.input(input_name):
+                        if var_name in persistable_var_names:
+                            self._quantized_weight_var_name.add(var_name)
+                        else:
+                            self._quantized_act_var_name.add(var_name)
+                for output_name in name_list[1]:
+                    for var_name in op.output(output_name):
+                        if var_name in persistable_var_names:
+                            self._quantized_weight_var_name.add(var_name)
+                        else:
+                            self._quantized_act_var_name.add(var_name)
+
+    def _set_activation_persistable(self):
+        '''
+        Set activation variables to be persistable, so that we can obtain
+        the tensor data in sample_data.
+        '''
+        persistable_var_names = []
+        for var in self._program.list_vars():
+            if var.persistable:
+                persistable_var_names.append(var.name)
         for var in self._program.list_vars():
             if var.name in self._quantized_act_var_name:
                 var.persistable = True
 
+    def _reset_activation_persistable(self):
+        '''
+        Reset activations to be not persistable.
+        '''
+        for var in self._program.list_vars():
+            if var.name in self._quantized_act_var_name:
+                var.persistable = False
+
+    def _sample_threshold(self):
+        '''
+        Sample the input thresholds (min, max, or abs_max) in every iteration.
+        '''
+        assert self._algo in ["abs_max", "min_max"], \
+            "The algo should be abs_max or min_max to sample min max value."
+        if self._algo == "abs_max":
+            # Only calculate the abs_max value of weights once
+            if self._quantized_var_abs_max == {}:
+                for var_name in self._quantized_weight_var_name:
+                    var_tensor = _load_variable_data(self._scope, var_name)
+                    abs_max_per_channel = []
+                    for i in range(var_tensor.shape[0]):
+                        abs_max_per_channel.append(
+                            float(np.max(np.abs(var_tensor[i]))))
+                    self._quantized_var_abs_max[var_name] = abs_max_per_channel
+            for var_name in self._quantized_act_var_name:
+                var_tensor = _load_variable_data(self._scope, var_name)
+                abs_max_value = float(np.max(np.abs(var_tensor)))
+                if (var_name not in self._quantized_var_abs_max) or \
+                    (abs_max_value > self._quantized_var_abs_max[var_name]):
+                    self._quantized_var_abs_max[var_name] = abs_max_value
+        elif self._algo == "min_max":
+            if self._quantized_var_min == {} and self._quantized_var_max == {}:
+                for var_name in self._quantized_weight_var_name:
+                    var_tensor = _load_variable_data(self._scope, var_name)
+                    min_per_channel = []
+                    max_per_channel = []
+                    for i in range(var_tensor.shape[0]):
+                        min_per_channel.append(float(np.min(var_tensor[i])))
+                        max_per_channel.append(float(np.max(var_tensor[i])))
+                    self._quantized_var_min[var_name] = min_per_channel
+                    self._quantized_var_max[var_name] = max_per_channel
+            for var_name in self._quantized_act_var_name:
+                var_tensor = _load_variable_data(self._scope, var_name)
+                min_value = float(np.min(var_tensor))
+                max_value = float(np.max(var_tensor))
+                if (var_name not in self._quantized_var_min) or \
+                    (min_value < self._quantized_var_min[var_name]):
+                    self._quantized_var_min[var_name] = min_value
+                if (var_name not in self._quantized_var_max) or \
+                    (max_value > self._quantized_var_max[var_name]):
+                    self._quantized_var_max[var_name] = max_value
+
+    def _save_input_threshold(self):
+        '''
+        Save input threshold to the quantized op.
+        '''
+        assert self._algo == "min_max", \
+            "The algo should be min_max to save input threshold."
+        for op in self._program.global_block().ops:
+            if op.type in self._quantizable_op_type:
+                input_name_list = self._op_real_in_out_name[op.type][0]
+                for input_name in input_name_list:
+                    for var_name in op.input(input_name):
+                        assert var_name in self._quantized_var_min
+                        assert var_name in self._quantized_var_max
+                        op._set_attr(var_name + ".min",
                                     self._quantized_var_min[var_name])
+                        op._set_attr(var_name + ".max",
                                     self._quantized_var_max[var_name])
+
     def _sample_data(self, iter):
         '''
         Sample the tensor data of quantized variables, applied in every iteration.
         '''
+        assert self._algo == "KL", "The algo should be KL to sample data."
         for var_name in self._quantized_weight_var_name:
             if var_name not in self._sampling_data:
                 var_tensor = _load_variable_data(self._scope, var_name)
@@ -344,19 +432,20 @@
             var_tensor = var_tensor.ravel()
             self._sampling_data[var_name].append(var_tensor)
 
-    def _calculate_scale_factor(self):
+    def _calculate_kl_threshold(self):
         '''
-        Calculate the scale factor of quantized variables.
+        Calculate the KL threshold of quantized variables.
         '''
+        _logger.info("Calculate KL threshold ...")
+        assert self._algo == "KL", "The algo should be KL to calculate the KL threshold."
         # apply channel_wise_abs_max quantization for weights
         for var_name in self._quantized_weight_var_name:
             data = self._sampling_data[var_name]
-            scale_factor_per_channel = []
+            threshold_per_channel = []
             for i in range(data.shape[0]):
                 abs_max_value = np.max(np.abs(data[i]))
-                scale_factor_per_channel.append(abs_max_value)
-            self._quantized_var_scale_factor[
-                var_name] = scale_factor_per_channel
+                threshold_per_channel.append(abs_max_value)
+            self._quantized_var_kl_threshold[var_name] = threshold_per_channel
 
         # apply kl quantization for activation
         if self._is_use_cache_file:
@@ -369,36 +458,25 @@
                     sampling_data.append(np.load(file_path))
                     os.remove(file_path)
                 sampling_data = np.concatenate(sampling_data)
-
-                if self._algo == "KL":
-                    self._quantized_var_scale_factor[var_name] = \
-                        self._get_kl_scaling_factor(np.abs(sampling_data))
-                else:
-                    self._quantized_var_scale_factor[var_name] = \
-                        np.max(np.abs(sampling_data))
+                self._quantized_var_kl_threshold[var_name] = \
+                    self._get_kl_scaling_factor(np.abs(sampling_data))
         else:
             for var_name in self._quantized_act_var_name:
                 self._sampling_data[var_name] = np.concatenate(
                     self._sampling_data[var_name])
-                if self._algo == "KL":
-                    self._quantized_var_scale_factor[var_name] = \
-                        self._get_kl_scaling_factor(np.abs(self._sampling_data[var_name]))
-                else:
-                    self._quantized_var_scale_factor[var_name] = \
-                        np.max(np.abs(self._sampling_data[var_name]))
+                self._quantized_var_kl_threshold[var_name] = \
+                    self._get_kl_scaling_factor(np.abs(self._sampling_data[var_name]))
 
     def _update_program(self):
         '''
-        Insert fake_quantize/fake_dequantize op to the program.
+        Use QuantizationTransformPass and AddQuantDequantPass to insert
+        fake_quantize, fake_dequantize and fake_quant_dequant ops.
+        Besides, save all KL thresholds to the scale var nodes.
         '''
-        # reset quantized activation variable
-        for var in self._program.list_vars():
-            if var.name in self._quantized_act_var_name:
-                var.persistable = False
-
-        # use QuantizationTransformPass to insert fake_quantize/fake_dequantize op
+        _logger.info("Update the program ...")
         graph = IrGraph(core.Graph(self._program.desc), for_test=True)
+        # use QuantizationTransformPass to insert fake_quant/fake_dequantize op
         major_quantizable_op_types = []
         for op_type in QuantizationTransformPass._supported_quantizable_op_type:
             if op_type in self._quantizable_op_type:
@@ -424,8 +502,12 @@
             quantizable_op_type=minor_quantizable_op_types)
         add_quant_dequant_pass.apply(graph)
 
-        # save scale factor to scale var node
-        for key, val in self._quantized_var_scale_factor.items():
+        # save abs_max or KL threshold to scale var node
+        if self._algo == "KL":
+            scale_dict = self._quantized_var_kl_threshold
+        else:
+            scale_dict = self._quantized_var_abs_max
+        for key, val in scale_dict.items():
             _set_variable_data(
                 self._scope,
                 self._place,
@@ -450,33 +532,34 @@
         freeze_pass.apply(graph)
         self._program = graph.to_program()
 
-    def _save_output_scale(self):
+    def _save_output_threshold(self):
         '''
-        Save output scale to the quantized op.
+        Save output threshold to the quantized op.
         '''
-        output_scale_name = "output_scale"
         for op in self._program.global_block().ops:
             if op.type in self._quantizable_op_type:
                 output_name_list = self._op_real_in_out_name[op.type][1]
                 for output_name in output_name_list:
-                    for output_var_name in op.output(output_name):
-                        if output_var_name in self._quantized_var_scale_factor:
-                            op._set_attr(output_scale_name,
-                                         self._quantized_var_scale_factor[
-                                             output_var_name])
-
-    def _is_input_all_not_persistable(self, op, persistable_var_names):
-        '''
-        Analyze the real inputs of the op are all not persistable.
-        '''
-        is_input_all_not_persistable = True
-        input_name_list = self._op_real_in_out_name[op.type][0]
-        for input_name in input_name_list:
-            for var_name in op.input(input_name):
-                if var_name in persistable_var_names:
-                    is_input_all_not_persistable = False
-                    break
-        return is_input_all_not_persistable
+                    for var_name in op.output(output_name):
+                        if self._algo == "KL":
+                            assert var_name in self._quantized_var_kl_threshold
+                            op._set_attr(
+                                var_name + ".threshold",
+                                self._quantized_var_kl_threshold[var_name])
+                            op._set_attr("quantization_type", "post_kl")
+                        elif self._algo == "abs_max":
+                            assert var_name in self._quantized_var_abs_max
+                            op._set_attr(var_name + ".threshold",
+                                         self._quantized_var_abs_max[var_name])
+                            op._set_attr("quantization_type", "post_abs_max")
+                        elif self._algo == "min_max":
+                            assert var_name in self._quantized_var_min
+                            assert var_name in self._quantized_var_max
+                            op._set_attr(var_name + ".min",
+                                         self._quantized_var_min[var_name])
+                            op._set_attr(var_name + ".max",
+                                         self._quantized_var_max[var_name])
+                            op._set_attr("quantization_type", "post_min_max")
 
     def _get_kl_scaling_factor(self, activation_blob, num_quantized_bins=255):
         '''
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index fa6a6e60ae36c84f940cd36c68660a221fcbd75e..213033f883c283112a234d7f9dcb448b5680e6da 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -35,6 +35,10 @@ _fake_dequant_op_list = [
     'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
 ]
 
+_fake_quant_dequant_op_list = [
+    'fake_quantize_dequantize_moving_average_abs_max'
+]
+
 _out_scale_op_list = [
     "mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", "depthwise_conv2d",
     "batch_norm", "concat", "tanh", "pad", "elementwise_add", "elementwise_mul",
@@ -44,7 +48,7 @@ _out_scale_op_list = [
 # list op real input and output names, to avoid processing input such as AxisTensor.
 _op_real_in_out_name = {
     "conv2d": [["Input", "Filter"], ["Output"]],
-    "depthwise_conv2d": [["Input"], ["Output"]],
+    "depthwise_conv2d": [["Input", "Filter"], ["Output"]],
     "mul": [["X", "Y"], ["Out"]],
     "matmul": [["X", "Y"], ["Out"]],
     "pool2d": [["X"], ["Out"]],
@@ -236,6 +240,7 @@ class QuantizationTransformPass(object):
                     op_node.op()._set_attr("skip_quant", True)
 
         def _transform_forward(graph, op):
+            op.op()._set_attr("quantization_type", "qat_with_weight")
             for var_node in op.inputs:
                 if var_node.name() not in op.input_arg_names():
                     continue
@@ -290,7 +295,7 @@ class QuantizationTransformPass(object):
         # The loop for transforming the forward graph:
         for op in ops:
             if op.name() in self._quantizable_ops:
-                if not QuantizationTransformPass._is_skip_quant(graph, op):
+                if not self._is_skip_quant(graph, op):
                     _transform_forward(graph, op)
         # The loop for renaming the inputs of backward op.
         for op in ops:
@@ -636,8 +641,7 @@ class QuantizationTransformPass(object):
         """
         return "%s.scale" % (var_name)
 
-    @staticmethod
-    def _is_skip_quant(graph, op_node):
+    def _is_skip_quant(self, graph, op_node):
         """
         Analyse whether the op node skips quantization.
         """
@@ -650,20 +654,20 @@ class QuantizationTransformPass(object):
         if op_node.name() in ["mul", "matmul"] and \
             _is_input_all_not_persistable(graph, op_node):
             is_skip = True
+        if op_node.op().has_attr("quantization_type") and \
+            op_node.op().attr("quantization_type") == "qat_without_weight":
+            is_skip = True
         return is_skip
 
 
 class QuantizationFreezePass(object):
-    _supported_quantizable_op_type = \
-        QuantizationTransformPass._supported_quantizable_op_type
-
     def __init__(self,
                  scope,
                  place,
                  weight_bits=8,
                  activation_bits=8,
                  weight_quantize_type='abs_max',
-                 quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']):
+                 quantizable_op_type=None):
         """
         The freeze pass is used to adjust the quantize operator order, for example:
             1) `activation -> quant -> dequant -> conv2d` will be frozen into
@@ -679,9 +683,8 @@ class QuantizationFreezePass(object):
             weight_quantize_type(str): quantization type for weights, support 'abs_max' and
                 'channel_wise_abs_max'. The 'range_abs_max' usually is not used for weight,
                 since weights are fixed once the model is well trained.
-            quantizable_op_type(list[str]): List the type of ops that will be quantized.
-                Default is ["conv2d", "depthwise_conv2d", "mul"]. The quantizable_op_type in
-                QuantizationTransformPass and ConvertToInt8Pass must be the same as this.
+            quantizable_op_type(list[str]): This input param will be removed later. The pass
+                will process all quantized ops, so it is not necessary to set this param.
         """
         assert scope is not None, \
             'The scope cannot be set None.'
@@ -692,16 +695,12 @@ class QuantizationFreezePass(object):
         self._weight_bits = weight_bits
         self._activation_bits = activation_bits
         self._weight_quantize_type = weight_quantize_type
-        self._quantizable_ops = quantizable_op_type
-        for op in self._quantizable_ops:
-            assert op in QuantizationFreezePass._supported_quantizable_op_type, \
-                op + " is not supported for quantization."
         self._conv_ops = ['conv2d', 'depthwise_conv2d']
         self._fake_quant_op_names = _fake_quant_op_list
         self._fake_dequant_op_names = _fake_dequant_op_list
         self._op_input_rename_map = collections.OrderedDict()
         self._op_output_rename_map = collections.OrderedDict()
-        self._var_scale_map = collections.OrderedDict()
+        self._quant_var_scale_map = collections.OrderedDict()
 
     def apply(self, graph):
         """
@@ -712,6 +711,7 @@ class QuantizationFreezePass(object):
         Returns:
             None
         """
+        # Get input scales in fake quant ops and process weights
         persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
         ops = graph.all_op_nodes()
         for op_node in ops:
@@ -733,7 +733,7 @@ class QuantizationFreezePass(object):
                     else:
                         scale_v = self._load_var(
                             op_node.output('OutScale')[0])[0]
-                    self._var_scale_map[input_arg_name] = scale_v
+                    self._quant_var_scale_map[input_arg_name] = scale_v
                     self._remove_fake_quant_and_dequant_op(graph, op_node)
                     # quantize weight and restore
                     param_v = self._load_var(input_arg_name)
@@ -743,32 +743,29 @@ class QuantizationFreezePass(object):
                 else:
                     scale_v = graph._find_node_by_name(
                         op_node.outputs, op_node.output('OutScale')[0])
-                    self._var_scale_map[input_arg_name] = scale_v
+                    self._quant_var_scale_map[input_arg_name] = scale_v
 
+        # Remove all fake dequant ops
         ops = graph.all_op_nodes()
         for op_node in ops:
             op_name = op_node.name()
             if op_name in self._fake_dequant_op_names:
                 self._remove_fake_quant_and_dequant_op(graph, op_node)
 
+        # Insert post dequant ops
        ops = graph.all_op_nodes()
        for op_node in ops:
-            op_name = op_node.name()
-            if op_name in self._quantizable_ops:
-                # only process the node that is quantized by QuantizationTransformPass
-                is_op_node_quantized = False
-                for var_node in op_node.inputs:
-                    var_name = var_node.name()
-                    if var_name.endswith('.dequantized'):
-                        is_op_node_quantized = True
-                if is_op_node_quantized:
-                    if self._weight_quantize_type == 'channel_wise_abs_max' and op_name in self._conv_ops:
-                        self._insert_post_channel_dequant_op(graph, op_node)
-                    else:
-                        self._insert_post_dequant_op(graph, op_node)
+            op_node_desc = op_node.op()
+            if op_node_desc.has_attr("quantization_type") and \
+                op_node_desc.attr("quantization_type") == "qat_with_weight":
+                if self._weight_quantize_type == 'channel_wise_abs_max' \
+                    and op_node.name() in self._conv_ops:
+                    self._insert_post_channel_dequant_op(graph, op_node)
+                else:
+                    self._insert_post_dequant_op(graph, op_node)
 
+        # Rename inputs of the ops that follow the inserted dequant_op after fc/conv
         for op_node in ops:
-            # insert dequant_op after fc/conv, need to rename inputs of the followed ops
             for var_node in op_node.inputs:
                 if var_node.node in self._op_output_rename_map:
                     old_in = var_node
@@ -802,7 +799,7 @@ class QuantizationFreezePass(object):
             new_in.clear_outputs()
             graph.update_input_link(old_in, new_in, op_node)
             original_var_name = self._original_var_name(name)
-            scale_v = self._var_scale_map[original_var_name]
+            scale_v = self._quant_var_scale_map[original_var_name]
             if original_var_name in persistable_vars:
                 assert isinstance(
                     scale_v,
@@ -811,7 +808,7 @@ class QuantizationFreezePass(object):
                 channel_scale = np.array(scale_v)
             else:
                 assert isinstance(scale_v, IrNode)
-                scale_var_node = self._var_scale_map[original_var_name]
+                scale_var_node = self._quant_var_scale_map[original_var_name]
 
             if len(op_node.output_arg_names()) != 1:
                 raise ValueError("Only support one output, but op %s has"
@@ -867,7 +864,7 @@ class QuantizationFreezePass(object):
             new_in.clear_outputs()
             graph.update_input_link(old_in, new_in, op_node)
             original_var_name = self._original_var_name(name)
-            scale_v = self._var_scale_map[original_var_name]
+            scale_v = self._quant_var_scale_map[original_var_name]
             if original_var_name in persistable_vars:
                 assert self._is_float(
                     scale_v), 'The scale of parameter %s is not a float.' % (
@@ -876,7 +873,7 @@ class QuantizationFreezePass(object):
             else:
                 max_range *= act_range
                 assert isinstance(scale_v, IrNode)
-                scale_var_node = self._var_scale_map[original_var_name]
+                scale_var_node = self._quant_var_scale_map[original_var_name]
 
             if len(op_node.output_arg_names()) != 1:
                 raise ValueError("Only support one output, but op %s has"
@@ -963,13 +960,7 @@ class QuantizationFreezePass(object):
 
 
 class ConvertToInt8Pass(object):
-    _supported_quantizable_op_type = \
-        QuantizationTransformPass._supported_quantizable_op_type
-
-    def __init__(self,
-                 scope,
-                 place,
-                 quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']):
+    def __init__(self, scope, place, quantizable_op_type=None):
         """
         Convert the weights into int8_t type.
 
@@ -977,9 +968,8 @@ class ConvertToInt8Pass(object):
             scope(fluid.Scope): scope is used to get the weight tensor values.
             place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the
                 8bits weight tensors.
-            quantizable_op_type(list[str]): List the type of ops that will be quantized.
-                Default is ["conv2d", "depthwise_conv2d", "mul"]. The quantizable_op_type in
-                QuantizationTransformPass and QuantizationFreezePass must be the same as this.
+            quantizable_op_type(list[str]): This input param will be removed later. The pass
+                will process all quantized ops, so it is not necessary to set this param.
         """
         assert scope is not None, \
             'The scope cannot be set None.'
@@ -987,10 +977,6 @@ class ConvertToInt8Pass(object):
             'The place cannot be set None.'
         self._scope = scope
         self._place = place
-        self._quantizable_ops = quantizable_op_type
-        for op in self._quantizable_ops:
-            assert op in ConvertToInt8Pass._supported_quantizable_op_type, \
-                op + " is not supported for quantization."
 
     def apply(self, graph):
         """
@@ -1006,10 +992,8 @@ class ConvertToInt8Pass(object):
         ops = graph.all_op_nodes()
         input_map = {}
         for op_node in ops:
-            op_name = op_node.name()
-            if op_name in self._quantizable_ops:
-                if QuantizationTransformPass._is_skip_quant(graph, op_node):
-                    continue
+            if op_node.op().has_attr("quantization_type") and \
+                op_node.op().attr("quantization_type") == "qat_with_weight":
                 for var_node in op_node.inputs:
                     name = var_node.name()
                     if name in persistable_vars:
@@ -1259,9 +1243,9 @@ class AddQuantDequantPass(object):
         "equal", "gather", "greater_equal", "greater_than", "less_equal",
         "less_than", "mean", "not_equal", "reshape", "reshape2",
         "bilinear_interp", "nearest_interp", "trilinear_interp", "slice",
-        "squeeze", "elementwise_sub", "mul", "matmul"
+        "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6",
+        "leaky_relu", "tanh", "swish"
     ]
-    _activation_type = ["relu", "relu6", "leaky_relu", "tanh", "swish"]
 
     def __init__(self,
                  scope=None,
@@ -1307,8 +1291,7 @@ class AddQuantDequantPass(object):
         else:
             self._quantizable_op_type = quantizable_op_type
             for op_type in quantizable_op_type:
-                assert op_type in AddQuantDequantPass._supported_quantizable_op_type + \
-                    AddQuantDequantPass._activation_type, \
+                assert op_type in AddQuantDequantPass._supported_quantizable_op_type, \
                     op_type + " is not supported for quantization."
         self._quantizable_grad_op_type = [
             '%s_grad' % (op) for op in self._quantizable_op_type
@@ -1343,17 +1326,15 @@ class AddQuantDequantPass(object):
                 elif isinstance(self._skip_pattern, str):
                     is_skip = op_node.op().has_attr("op_namescope") and \
                         op_node.op().attr("op_namescope").find(self._skip_pattern) != -1
-
-                is_op_node_quantized = False
-                for var_node in op_node.inputs:
-                    var_name = var_node.name()
-                    if var_name.endswith('.dequantized'):
-                        is_op_node_quantized = True
-
-                if is_skip or is_op_node_quantized or \
+                is_quantized = op_node.op().has_attr("quantization_type") and \
+                    op_node.op().attr("quantization_type") == "qat_with_weight"
+                if is_skip or is_quantized or \
                     (not _is_input_all_not_persistable(graph, op_node)):
                     continue
 
+                op_node.op()._set_attr("quantization_type",
                                        "qat_without_weight")
+                op_node.op()._set_attr("activation_bits", self._quant_bits)
                 input_name_list = _op_real_in_out_name[op_node.name()][0]
                 arg_names = []
                 for input_name in input_name_list:
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
index e1be7c6809d4ac0c0d2a622a55161cfcca894f42..45140aec4e5f6159a16b52a22bda3e79dd3e3c60 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
@@ -264,7 +264,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
         ptq.save_quantized_model(self.int8_model)
 
     def run_test(self, model, algo, data_urls, data_md5s, quantizable_op_type,
-                 is_full_quantize, is_use_cache_file):
+                 is_full_quantize, is_use_cache_file, diff_threshold):
         infer_iterations = self.infer_iterations
         batch_size = self.batch_size
         sample_iterations = self.sample_iterations
@@ -296,11 +296,11 @@ class TestPostTrainingQuantization(unittest.TestCase):
         sys.stdout.flush()
 
         delta_value = fp32_acc1 - int8_acc1
-        self.assertLess(delta_value, 0.025)
+        self.assertLess(delta_value, diff_threshold)
 
 
-class TestPostTrainingForMobilenetv1(TestPostTrainingQuantization):
-    def test_post_training_mobilenetv1(self):
+class TestPostTrainingKLForMobilenetv1(TestPostTrainingQuantization):
+    def test_post_training_kl_mobilenetv1(self):
         model = "MobileNet-V1"
         algo = "KL"
         data_urls = [
@@ -310,10 +310,29 @@ class TestPostTrainingForMobilenetv1(TestPostTrainingQuantization):
         quantizable_op_type = [
             "conv2d", "depthwise_conv2d", "mul", "pool2d", "elementwise_add"
         ]
-        is_full_quantize = True
+        is_full_quantize = False
         is_use_cache_file = False
+        diff_threshold = 0.025
         self.run_test(model, algo, data_urls, data_md5s, quantizable_op_type,
-                      is_full_quantize, is_use_cache_file)
+                      is_full_quantize, is_use_cache_file, diff_threshold)
+
+
+class TestPostTrainingAbsMaxForMobilenetv1(TestPostTrainingQuantization):
+    def test_post_training_abs_max_mobilenetv1(self):
+        model = "MobileNet-V1"
+        algo = "abs_max"
+        data_urls = [
+            'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
+        ]
+        data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
+        quantizable_op_type = [
+            "conv2d", "depthwise_conv2d", "mul", "pool2d", "elementwise_add"
+        ]
+        is_full_quantize = False
+        is_use_cache_file = False
+        diff_threshold = 0.05
+        self.run_test(model, algo, data_urls, data_md5s, quantizable_op_type,
+                      is_full_quantize, is_use_cache_file, diff_threshold)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
index 93d84112524e7e302ec22f99354e6169c512800e..373a65018800a52d8d8de5373ad95dde21001614 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
@@ -20,7 +20,7 @@ from test_post_training_quantization_mobilenetv1 import TestPostTrainingQuantiza
 class TestPostTrainingForResnet50(TestPostTrainingQuantization):
     def test_post_training_resnet50(self):
         model = "ResNet-50"
-        algo = "direct"
+        algo = "min_max"
         data_urls = [
             'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
         ]
@@ -28,8 +28,9 @@ class TestPostTrainingForResnet50(TestPostTrainingQuantization):
         quantizable_op_type = ["conv2d", "mul"]
         is_full_quantize = False
         is_use_cache_file = False
+        diff_threshold = 0.025
         self.run_test(model, algo, data_urls, data_md5s, quantizable_op_type,
-                      is_full_quantize, is_use_cache_file)
+                      is_full_quantize, is_use_cache_file, diff_threshold)
 
 
 if __name__ == '__main__':
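
For readers new to this API, below is a minimal usage sketch of the post training quantization flow that this patch extends. It is not part of the diff: the model path, the toy calibration reader and the batch settings are placeholder assumptions, while the constructor arguments and the three algo values ('KL', 'abs_max', 'min_max') follow the code above.

# Minimal usage sketch (assumptions: an FP32 inference model saved under
# ./mobilenetv1_fp32 with a single image input; replace the random reader
# with real calibration data).
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization

def sample_generator():
    # Hypothetical calibration reader: yields one random CHW float32 image
    # per sample; in practice, iterate over a real calibration dataset here.
    for _ in range(32):
        yield [np.random.random((3, 224, 224)).astype("float32")]

place = fluid.CPUPlace()
exe = fluid.Executor(place)

ptq = PostTrainingQuantization(
    executor=exe,
    model_dir="./mobilenetv1_fp32",     # assumed FP32 model directory
    sample_generator=sample_generator,
    batch_size=16,                      # placeholder batch settings
    batch_nums=2,
    algo="abs_max",                     # or 'KL' / 'min_max'
    quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
    is_full_quantize=False,
    is_use_cache_file=False)
ptq.quantize()
ptq.save_quantized_model("./mobilenetv1_int8")

With algo='KL' or 'abs_max' the pass rewrites the program with fake quant/dequant ops and stores thresholds on the quantized ops; with algo='min_max' it only records min/max attributes, as implemented in quantize() above.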