diff --git a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py
index ad62006ddbcb5977672489837cca6dee2aef9687..ce409b0dbfc6b83b8074f198880d85518a633fc6 100644
--- a/python/paddle/fluid/contrib/slim/quantization/__init__.py
+++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py
@@ -20,12 +20,15 @@ from . import quantization_strategy
 from .quantization_strategy import *
 from . import mkldnn_post_training_strategy
 from .mkldnn_post_training_strategy import *
-from . import quantization_mkldnn_pass
-from .quantization_mkldnn_pass import *
+from . import qat_int8_mkldnn_pass
+from .qat_int8_mkldnn_pass import *
+from . import qat2_int8_mkldnn_pass
+from .qat2_int8_mkldnn_pass import *
 from . import post_training_quantization
 from .post_training_quantization import *
 
 __all__ = quantization_pass.__all__ + quantization_strategy.__all__
 __all__ += mkldnn_post_training_strategy.__all__
-__all__ += quantization_mkldnn_pass.__all__
+__all__ += qat_int8_mkldnn_pass.__all__
+__all__ += qat2_int8_mkldnn_pass.__all__
 __all__ += post_training_quantization.__all__
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/qat2_int8_mkldnn_pass.py
similarity index 62%
rename from python/paddle/fluid/contrib/slim/quantization/quantization_mkldnn_pass.py
rename to python/paddle/fluid/contrib/slim/quantization/qat2_int8_mkldnn_pass.py
index 1d44db72ea27d23f9bc55a1e0e64baa2e1baa307..021f188311f67867a7fa39039130209a13a5f7e8 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/qat2_int8_mkldnn_pass.py
@@ -17,263 +17,7 @@ from .... import core
 from ....framework import IrGraph
 from ....framework import IrNode
 
-__all__ = ['QatInt8MkldnnPass', 'Qat2Int8MkldnnPass']
-
-
-class QatInt8MkldnnPass(object):
-    """
-    Convert QuantizationFreezePass generated IrGraph to MKL-DNN supported INT8
-    IrGraph. Following transformations did in this pass:
-    1. Convert int8 range weights with float32 data type, which are generated by
-    the QuantizationFreezePass, to float32 range weights with float32 data type
-    by using the corresponding scales. This conversion is because MKL-DNN INT8
-    conv2d kernel and mul kernel now only support float32 weights input, hence
-    weights quantization will happen inside the conv2d and mul INT8 kernel.
-    2. Create the new conv2d or mul op with the converted weights and link its output
-    to fake_dequantize_abs_max op's output and set conv2d's attribute "force_fp32
-    _output" as true
-    3. Transform fake_quantize_xx op to quantize op
-    4. Remove fake_dequantize_abs_max op
-    """
-
-    def __init__(self, _scope=None, _place=None):
-        """
-        Args:
-            scope(fluid.Scope): scope is used to initialize the new parameters.
-            place(fluid.CPUPlace): place is used to initialize the new parameters.
-
-
-        Examples:
-        .. code-block:: python
-            # The original graph will be rewrite.
-            import paddle.fluid as fluid
-            from paddle.fluid.contrib.slim.quantization \
-                import QatInt8MkldnnPass
-            from paddle.fluid.framework import IrGraph
-            from paddle.fluid import core
-
-            graph = IrGraph(core.Graph(fluid.Program().desc), for_test=False)
-            place = fluid.CPUPlace()
-            mkldnn_pass = QatInt8MkldnnPass(fluid.global_scope(),
-            place)
-            mkldnn_pass.apply(graph)
-        """
-
-        self._scope = _scope
-        self._place = _place
-
-        self._quantize_type = [
-            'fake_quantize_moving_average_abs_max',
-            'fake_quantize_range_abs_max'
-        ]
-        self._dequantize_type = ['fake_dequantize_max_abs']
-        self._quantize_dequantize_type = [
-            'fake_quantize_dequantize_moving_average_abs_max'
-        ]
-
-        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
-        self._conv_ops = ['conv2d', 'depthwise_conv2d']
-        self._pool_ops = ['pool2d']
-
-        self._in_scale = {}
-        self._max_range = {}
-        self._new_output = {}
-        self._s8_max = 127
-
-    def apply(self, graph):
-        """
-        Quantize the graph for running MKL-DNN INT8 inference. According
-        to activation quantization type, the graph will transform fake
-        quantize ops to quantize ops and remove the fake dequantize ops.
-
-        Args:
-            graph(IrGraph): the applied graph.
-        """
-
-        assert isinstance(graph,
-                          IrGraph), 'graph must be the instance of IrGraph.'
-        ops = graph.all_op_nodes()
-
-        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
-        # Collect the _in_scales and _max_range to calculate the new scales for MKL-DNN
-        # INT8 conv2d and mul
-        for op_node in ops:
-            if op_node.name() in self._dequantize_type:
-                input_name = op_node.input("X")[0]
-                scale_name = op_node.input("Scale")[0]
-                self._in_scale[input_name] = self._load_param(self._scope,
-                                                              scale_name)[0]
-                self._max_range[input_name] = op_node.op().attr("max_range")
-                self._new_output[input_name] = op_node.output("Out")[0]
-
-            if op_node.name() in self._quantize_dequantize_type:
-                inputs = op_node.op().input_names()
-                attrs = op_node.op().attr_names()
-                input_name = op_node.input("X")[0]
-                scale_name = op_node.input("InScale")[0]
-                self._in_scale[input_name] = self._load_param(self._scope,
-                                                              scale_name)[0]
-                # self._max_range[input_name] = op_node.op().attr("max_range")
-                self._new_output[input_name] = op_node.output("Out")[0]
-
-        for op_node in ops:
-            if op_node.name() in self._quantizable_ops:
-                if op_node.name() in self._conv_ops:
-                    self._transform_to_conv_mkldnn(graph, op_node)
-                elif op_node.name() in self._pool_ops:
-                    self._transform_to_pool_mkldnn(graph, op_node)
-                else:
-                    self._transform_to_mul_mkldnn(graph, op_node)
-            elif op_node.name() in self._quantize_type:
-                self._transform_to_quantize_mkldnn(graph, op_node)
-            elif op_node.name() in self._dequantize_type:
-                self._remove_fake_dequantize_op(graph, op_node)
-        self._remove_unused_var_nodes(graph)
-        return graph
-
-    def _transform_to_pool_mkldnn(self, graph, op):
-        output_name = op.output("Out")[0]
-        input_name = op.input("X")[0]
-
-    def _transform_to_conv_mkldnn(self, graph, op_node):
-        weight_name = op_node.input("Filter")[0]
-        output_name = op_node.output("Output")[0]
-        # Convert int8 range weights to fp32 range weights
-        weight = self._load_param(self._scope, weight_name)
-        w_fp32 = np.divide(
-            np.multiply(weight, self._s8_max), self._max_range[output_name])
-        w_fp32 = w_fp32.reshape(weight.shape)
-        self._restore_var(weight_name, w_fp32)
-        input_var_node = graph._find_node_by_name(op_node.inputs,
-                                                  op_node.input("Input")[0])
-        weight_var_node = graph._find_node_by_name(op_node.inputs, weight_name)
-
-        # Set fake_dequantize_abs_max's output as new output of conv2d
-        output_var_node = graph._find_node_by_name(
-            graph.all_var_nodes(), self._new_output[output_name])
-        attrs = {
-            name: op_node.op().attr(name)
-            for name in op_node.op().attr_names()
-        }
-
-        conv_op_node = graph.create_op_node(
-            op_type='conv2d',
-            attrs=attrs,
-            inputs={'Input': input_var_node,
-                    'Filter': weight_var_node},
-            outputs={'Output': output_var_node})
-
-        # Based on the QAT's scales to calculate the scales of MKL-DNN INT8 conv2d
-        scale_in = self._s8_max / self._in_scale[output_name]
-        scale_w = []
-        scale_w = [self._max_range[output_name] / self._s8_max]
-
-        conv_op_node.set_attr("Scale_weights", scale_w)
-        conv_op_node.set_attr("Scale_in", scale_in)
-        conv_op_node.set_attr("Scale_out", 1.0)
-        conv_op_node.set_attr("use_mkldnn", 1)
-        conv_op_node.set_attr("force_fp32_output", 1)
-        graph.link_to(input_var_node, conv_op_node)
-        graph.link_to(weight_var_node, conv_op_node)
-        graph.link_to(conv_op_node, output_var_node)
-        graph.safe_remove_nodes(op_node)
-
-    def _transform_to_mul_mkldnn(self, graph, op_node):
-        # For MKL-DNN INT8 mul, input Y should be the weights
-        weight_name = op_node.input("Y")[0]
-        output_name = op_node.output("Out")[0]
-        # Convert int8 range weights to fp32 range weights
-        weight = self._load_param(self._scope, weight_name)
-        w_fp32 = np.divide(
-            np.multiply(weight, self._s8_max), self._max_range[output_name])
-        w_fp32 = w_fp32.reshape(weight.shape)
-        self._restore_var(weight_name, w_fp32)
-        input_var_node = graph._find_node_by_name(op_node.inputs,
-                                                  op_node.input("X")[0])
-        weight_var_node = graph._find_node_by_name(op_node.inputs, weight_name)
-
-        # Set fake_dequantize_abs_max's output as new output of mul
-        output_var_node = graph._find_node_by_name(
-            graph.all_var_nodes(), self._new_output[output_name])
-        attrs = {
-            name: op_node.op().attr(name)
-            for name in op_node.op().attr_names()
-        }
-
-        mul_op_node = graph.create_op_node(
-            op_type='mul',
-            attrs=attrs,
-            inputs={'X': input_var_node,
-                    'Y': weight_var_node},
-            outputs={'Out': output_var_node})
-
-        # Based on the QAT's scales to calculate MKL-DNN INT8 mul's scales
-        scale_in = self._s8_max / self._in_scale[output_name]
-        scale_w = []
-        scale_w = [self._max_range[output_name] / self._s8_max]
-
-        mul_op_node.set_attr("scale_y", scale_w)
-        mul_op_node.set_attr("scale_x", scale_in)
-        mul_op_node.set_attr("scale_out", 1.0)
-        mul_op_node.set_attr("use_mkldnn", 1)
-        mul_op_node.set_attr("force_fp32_output", 1)
-        graph.link_to(input_var_node, mul_op_node)
-        graph.link_to(weight_var_node, mul_op_node)
-        graph.link_to(mul_op_node, output_var_node)
-        graph.safe_remove_nodes(op_node)
-
-    def _transform_to_quantize_mkldnn(self, graph, op_node):
-        """
-        Transform fake_quantize_xx op to quantize mkldnn op in the graph.
-        """
-        input_var_node = graph._find_node_by_name(op_node.inputs,
-                                                  op_node.input("X")[0])
-        output_var_node = graph._find_node_by_name(op_node.outputs,
-                                                   op_node.output("Out")[0])
-        scale_in = self._s8_max / self._load_param(
-            self._scope, op_node.input("InScale")[0])[0]
-        quant_op_node = graph.create_op_node(
-            op_type='quantize',
-            attrs={
-                'data_format': 'MKLDNNLAYOUT',
-                'use_mkldnn': 1,
-                'Scale': scale_in,
-                'is_negative_input': 1
-            },
-            inputs={'Input': input_var_node},
-            outputs={'Output': output_var_node})
-        graph.link_to(input_var_node, quant_op_node)
-        graph.link_to(quant_op_node, output_var_node)
-        graph.safe_remove_nodes(op_node)
-
-    def _remove_fake_dequantize_op(self, graph, op_node):
-        input_var_node = graph._find_node_by_name(op_node.inputs,
-                                                  op_node.input("X")[0])
-        graph.safe_remove_nodes(op_node)
-
-    def _load_param(self, scope, param_name):
-        return np.array(scope.find_var(param_name).get_tensor())
-
-    def _restore_var(self, name, array):
-        tensor = self._scope.find_var(name).get_tensor()
-        tensor.set(array, self._place)
-
-    def _remove_unused_var_nodes(self, graph):
-        all_used_vars = set()
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            for input_node in op_node.inputs:
-                all_used_vars.add(input_node)
-            for output_node in op_node.outputs:
-                all_used_vars.add(output_node)
-
-        all_used_vars = {n.node for n in all_used_vars}
-        all_unused_vars = {
-            n
-            for n in filter(lambda node: node.node not in all_used_vars,
-                            graph.all_var_nodes())
-        }
-        graph.safe_remove_nodes(all_unused_vars)
+__all__ = ['Qat2Int8MkldnnPass']
 
 
 class Qat2Int8MkldnnPass(object):
diff --git a/python/paddle/fluid/contrib/slim/quantization/qat_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/qat_int8_mkldnn_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..51f8497e21cd78fb6cda7f626553168844dd1215
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/qat_int8_mkldnn_pass.py
@@ -0,0 +1,276 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from .... import core
+from ....framework import IrGraph
+from ....framework import IrNode
+
+__all__ = ['QatInt8MkldnnPass']
+
+
+class QatInt8MkldnnPass(object):
+    """
+    Convert the IrGraph generated by QuantizationFreezePass to an MKL-DNN
+    supported INT8 IrGraph. The following transformations are done in this pass:
+    1. Convert int8 range weights with float32 data type, which are generated
+    by the QuantizationFreezePass, to float32 range weights with float32 data
+    type by using the corresponding scales. This conversion is needed because
+    the MKL-DNN INT8 conv2d and mul kernels currently support only float32
+    weight input, so weight quantization happens inside the INT8 kernels.
+    2. Create a new conv2d or mul op with the converted weights, link its
+    output to the fake_dequantize_abs_max op's output, and set the op's
+    attribute "force_fp32_output" to true.
+    3. Transform each fake_quantize_xx op into a quantize op.
+    4. Remove the fake_dequantize_abs_max ops.
+    """
+
+    def __init__(self, _scope=None, _place=None):
+        """
+        Args:
+            _scope(fluid.Scope): scope used to initialize the new parameters.
+            _place(fluid.CPUPlace): place used to initialize the new parameters.
+
+
+        Examples:
+        .. code-block:: python
+            # The original graph will be rewritten.
+            import paddle.fluid as fluid
+            from paddle.fluid.contrib.slim.quantization \
+                import QatInt8MkldnnPass
+            from paddle.fluid.framework import IrGraph
+            from paddle.fluid import core
+
+            graph = IrGraph(core.Graph(fluid.Program().desc), for_test=False)
+            place = fluid.CPUPlace()
+            mkldnn_pass = QatInt8MkldnnPass(fluid.global_scope(),
+                                            place)
+            mkldnn_pass.apply(graph)
+        """
+
+        self._scope = _scope
+        self._place = _place
+
+        self._quantize_type = [
+            'fake_quantize_moving_average_abs_max',
+            'fake_quantize_range_abs_max'
+        ]
+        self._dequantize_type = ['fake_dequantize_max_abs']
+        self._quantize_dequantize_type = [
+            'fake_quantize_dequantize_moving_average_abs_max'
+        ]
+
+        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+        self._conv_ops = ['conv2d', 'depthwise_conv2d']
+        self._pool_ops = ['pool2d']
+
+        self._in_scale = {}
+        self._max_range = {}
+        self._new_output = {}
+        self._s8_max = 127
+
+    def apply(self, graph):
+        """
+        Quantize the graph for running MKL-DNN INT8 inference. According
+        to the activation quantization type, fake quantize ops are
+        transformed into quantize ops and the fake dequantize ops are removed.
+
+        Args:
+            graph(IrGraph): the applied graph.
+        """
+
+        assert isinstance(graph,
+                          IrGraph), 'graph must be an instance of IrGraph.'
+        ops = graph.all_op_nodes()
+
+        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
+        # Collect _in_scale and _max_range to calculate the new scales for the
+        # MKL-DNN INT8 conv2d and mul ops
+        for op_node in ops:
+            if op_node.name() in self._dequantize_type:
+                input_name = op_node.input("X")[0]
+                scale_name = op_node.input("Scale")[0]
+                self._in_scale[input_name] = self._load_param(self._scope,
+                                                              scale_name)[0]
+                self._max_range[input_name] = op_node.op().attr("max_range")
+                self._new_output[input_name] = op_node.output("Out")[0]
+
+            if op_node.name() in self._quantize_dequantize_type:
+                inputs = op_node.op().input_names()
+                attrs = op_node.op().attr_names()
+                input_name = op_node.input("X")[0]
+                scale_name = op_node.input("InScale")[0]
+                self._in_scale[input_name] = self._load_param(self._scope,
+                                                              scale_name)[0]
+                # self._max_range[input_name] = op_node.op().attr("max_range")
+                self._new_output[input_name] = op_node.output("Out")[0]
+
+        for op_node in ops:
+            if op_node.name() in self._quantizable_ops:
+                if op_node.name() in self._conv_ops:
+                    self._transform_to_conv_mkldnn(graph, op_node)
+                elif op_node.name() in self._pool_ops:
+                    self._transform_to_pool_mkldnn(graph, op_node)
+                else:
+                    self._transform_to_mul_mkldnn(graph, op_node)
+            elif op_node.name() in self._quantize_type:
+                self._transform_to_quantize_mkldnn(graph, op_node)
+            elif op_node.name() in self._dequantize_type:
+                self._remove_fake_dequantize_op(graph, op_node)
+        self._remove_unused_var_nodes(graph)
+        return graph
+
+    def _transform_to_pool_mkldnn(self, graph, op):
+        output_name = op.output("Out")[0]
+        input_name = op.input("X")[0]
+
+    def _transform_to_conv_mkldnn(self, graph, op_node):
+        weight_name = op_node.input("Filter")[0]
+        output_name = op_node.output("Output")[0]
+        # Convert int8 range weights to fp32 range weights
+        weight = self._load_param(self._scope, weight_name)
+        w_fp32 = np.divide(
+            np.multiply(weight, self._s8_max), self._max_range[output_name])
+        w_fp32 = w_fp32.reshape(weight.shape)
+        self._restore_var(weight_name, w_fp32)
+        input_var_node = graph._find_node_by_name(op_node.inputs,
+                                                  op_node.input("Input")[0])
+        weight_var_node = graph._find_node_by_name(op_node.inputs, weight_name)
+
+        # Set fake_dequantize_abs_max's output as the new output of conv2d
+        output_var_node = graph._find_node_by_name(
+            graph.all_var_nodes(), self._new_output[output_name])
+        attrs = {
+            name: op_node.op().attr(name)
+            for name in op_node.op().attr_names()
+        }
+
+        conv_op_node = graph.create_op_node(
+            op_type='conv2d',
+            attrs=attrs,
+            inputs={'Input': input_var_node,
+                    'Filter': weight_var_node},
+            outputs={'Output': output_var_node})
+
+        # Calculate the scales of the MKL-DNN INT8 conv2d from the QAT scales
+        scale_in = self._s8_max / self._in_scale[output_name]
+        scale_w = []
+        scale_w = [self._max_range[output_name] / self._s8_max]
+
+        conv_op_node.set_attr("Scale_weights", scale_w)
+        conv_op_node.set_attr("Scale_in", scale_in)
+        conv_op_node.set_attr("Scale_out", 1.0)
+        conv_op_node.set_attr("use_mkldnn", 1)
+        conv_op_node.set_attr("force_fp32_output", 1)
+        graph.link_to(input_var_node, conv_op_node)
+        graph.link_to(weight_var_node, conv_op_node)
+        graph.link_to(conv_op_node, output_var_node)
+        graph.safe_remove_nodes(op_node)
+
+    def _transform_to_mul_mkldnn(self, graph, op_node):
+        # For MKL-DNN INT8 mul, input Y should be the weights
+        weight_name = op_node.input("Y")[0]
+        output_name = op_node.output("Out")[0]
+        # Convert int8 range weights to fp32 range weights
+        weight = self._load_param(self._scope, weight_name)
+        w_fp32 = np.divide(
+            np.multiply(weight, self._s8_max), self._max_range[output_name])
+        w_fp32 = w_fp32.reshape(weight.shape)
+        self._restore_var(weight_name, w_fp32)
+        input_var_node = graph._find_node_by_name(op_node.inputs,
+                                                  op_node.input("X")[0])
+        weight_var_node = graph._find_node_by_name(op_node.inputs, weight_name)
+
+        # Set fake_dequantize_abs_max's output as the new output of mul
+        output_var_node = graph._find_node_by_name(
+            graph.all_var_nodes(), self._new_output[output_name])
+        attrs = {
+            name: op_node.op().attr(name)
+            for name in op_node.op().attr_names()
+        }
+
+        mul_op_node = graph.create_op_node(
+            op_type='mul',
+            attrs=attrs,
+            inputs={'X': input_var_node,
+                    'Y': weight_var_node},
+            outputs={'Out': output_var_node})
+
+        # Calculate the scales of the MKL-DNN INT8 mul from the QAT scales
+        scale_in = self._s8_max / self._in_scale[output_name]
+        scale_w = []
+        scale_w = [self._max_range[output_name] / self._s8_max]
+
+        mul_op_node.set_attr("scale_y", scale_w)
+        mul_op_node.set_attr("scale_x", scale_in)
+        mul_op_node.set_attr("scale_out", 1.0)
+        mul_op_node.set_attr("use_mkldnn", 1)
+        mul_op_node.set_attr("force_fp32_output", 1)
+        graph.link_to(input_var_node, mul_op_node)
+        graph.link_to(weight_var_node, mul_op_node)
+        graph.link_to(mul_op_node, output_var_node)
+        graph.safe_remove_nodes(op_node)
+
+    def _transform_to_quantize_mkldnn(self, graph, op_node):
+        """
+        Transform a fake_quantize_xx op into an MKL-DNN quantize op in the graph.
+        """
+        input_var_node = graph._find_node_by_name(op_node.inputs,
+                                                  op_node.input("X")[0])
+        output_var_node = graph._find_node_by_name(op_node.outputs,
+                                                   op_node.output("Out")[0])
+        scale_in = self._s8_max / self._load_param(
+            self._scope, op_node.input("InScale")[0])[0]
+        quant_op_node = graph.create_op_node(
+            op_type='quantize',
+            attrs={
+                'data_format': 'MKLDNNLAYOUT',
+                'use_mkldnn': 1,
+                'Scale': scale_in,
+                'is_negative_input': 1
+            },
+            inputs={'Input': input_var_node},
+            outputs={'Output': output_var_node})
+        graph.link_to(input_var_node, quant_op_node)
+        graph.link_to(quant_op_node, output_var_node)
+        graph.safe_remove_nodes(op_node)
+
+    def _remove_fake_dequantize_op(self, graph, op_node):
+        input_var_node = graph._find_node_by_name(op_node.inputs,
+                                                  op_node.input("X")[0])
+        graph.safe_remove_nodes(op_node)
+
+    def _load_param(self, scope, param_name):
+        return np.array(scope.find_var(param_name).get_tensor())
+
+    def _restore_var(self, name, array):
+        tensor = self._scope.find_var(name).get_tensor()
+        tensor.set(array, self._place)
+
+    def _remove_unused_var_nodes(self, graph):
+        all_used_vars = set()
+        ops = graph.all_op_nodes()
+        for op_node in ops:
+            for input_node in op_node.inputs:
+                all_used_vars.add(input_node)
+            for output_node in op_node.outputs:
+                all_used_vars.add(output_node)
+
+        all_used_vars = {n.node for n in all_used_vars}
+        all_unused_vars = {
+            n
+            for n in filter(lambda node: node.node not in all_used_vars,
+                            graph.all_var_nodes())
+        }
+        graph.safe_remove_nodes(all_unused_vars)
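
A minimal usage sketch of the split passes (adapted from the QatInt8MkldnnPass
docstring above; the empty Program is only a placeholder for a real QAT-frozen
inference program):

    import paddle.fluid as fluid
    from paddle.fluid import core
    from paddle.fluid.framework import IrGraph
    # Both passes stay importable from the package after the __init__.py
    # change above; only the module names changed.
    from paddle.fluid.contrib.slim.quantization import QatInt8MkldnnPass
    from paddle.fluid.contrib.slim.quantization import Qat2Int8MkldnnPass

    # Build an IrGraph from an inference program (placeholder program here).
    graph = IrGraph(core.Graph(fluid.Program().desc), for_test=False)
    place = fluid.CPUPlace()

    # Rewrite the graph for MKL-DNN INT8 inference.
    mkldnn_pass = QatInt8MkldnnPass(fluid.global_scope(), place)
    graph = mkldnn_pass.apply(graph)

The weight conversion in step 1 of QatInt8MkldnnPass is a plain rescaling; a
numpy sketch with illustrative values:

    import numpy as np

    s8_max = 127.0
    # The "max_range" attribute read from the matching fake_dequantize_max_abs
    # op (illustrative value).
    max_range = 2540.0
    int8_range_weights = np.array([[-127., 64.], [32., 127.]], dtype=np.float32)
    # Same arithmetic as the pass: w_fp32 = weights * s8_max / max_range
    w_fp32 = np.divide(np.multiply(int8_range_weights, s8_max), max_range)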