quantization_pass.py 62.6 KB
Newer Older
W
WangZhen 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
W
WangZhen 已提交
16
import numpy as np
W
WangZhen 已提交
17
from ..... import compat as cpt
W
WangZhen 已提交
18
from .... import core
19
from ....framework import IrGraph
20
from ....framework import IrNode
21
from ....framework import Operator
W
WangZhen 已提交
22 23
from .... import unique_name

24 25
__all__ = [
    'QuantizationTransformPass', 'QuantizationFreezePass', 'ConvertToInt8Pass',
26 27
    'TransformForMobilePass', 'OutScaleForTrainingPass',
    'OutScaleForInferencePass', 'AddQuantDequantPass'
28
]
W
WangZhen 已提交
29

30 31 32 33 34 35 36 37 38
# Op types of the fake quantize ops referenced by the passes in this module.
_fake_quant_op_list = [
    'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
    'fake_quantize_moving_average_abs_max', 'fake_channel_wise_quantize_abs_max'
]

# Op types of the fake dequantize ops referenced by the passes in this module.
_fake_dequant_op_list = [
    'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
]

# Op types that quantize and dequantize in a single op.
_fake_quant_dequant_op_list = [
    'fake_quantize_dequantize_moving_average_abs_max'
]

# Op types listed for output-scale handling.
# NOTE(review): presumably consumed by the OutScale* passes exported in
# __all__; they are outside this part of the file -- confirm.
_out_scale_op_list = [
    "conv2d", "depthwise_conv2d", "mul", "matmul", "relu", "leaky_relu",
    "relu6", "sigmoid", "tanh", "prelu", "swish", "softmax", "batch_norm",
    "elementwise_add", "pool2d", "reshape2", "transpose2", "concat"
]

# list op real input and output names, to avoid processing input such as AxisTensor.
# Each value is [input_slot_names, output_slot_names].
_op_real_in_out_name = {
    "conv2d": [["Input", "Filter"], ["Output"]],
    "depthwise_conv2d": [["Input", "Filter"], ["Output"]],
    "mul": [["X", "Y"], ["Out"]],
    "matmul": [["X", "Y"], ["Out"]],
    "pool2d": [["X"], ["Out"]],
    "elementwise_add": [["X", "Y"], ["Out"]],
    "concat": [["X"], ["Out"]],
    "softmax": [["X"], ["Out"]],
    "argmax": [["X"], ["Out"]],
    "transpose": [["X"], ["Out"]],
    "equal": [["X", "Y"], ["Out"]],
    "gather": [["X"], ["Out"]],
    "greater_equal": [["X", "Y"], ["Out"]],
    "greater_than": [["X", "Y"], ["Out"]],
    "less_equal": [["X", "Y"], ["Out"]],
    "less_than": [["X", "Y"], ["Out"]],
    "mean": [["X"], ["Out"]],
    "not_equal": [["X", "Y"], ["Out"]],
    "reshape": [["X"], ["Out"]],
    "reshape2": [["X"], ["Out"]],
    "transpose2": [["X"], ["Out"]],
    "bilinear_interp": [["X"], ["Out"]],
    "nearest_interp": [["X"], ["Out"]],
    "trilinear_interp": [["X"], ["Out"]],
    "slice": [["Input"], ["Out"]],
    "squeeze": [["X"], ["Out"]],
    "elementwise_sub": [["X", "Y"], ["Out"]],
    "relu": [["X"], ["Out"]],
    "relu6": [["X"], ["Out"]],
    "leaky_relu": [["X"], ["Out"]],
    "prelu": [["X"], ["Out"]],
    "tanh": [["X"], ["Out"]],
    "swish": [["X"], ["Out"]],
    "dropout": [["X"], ["Out"]],
    # batch_norm names its main output slot "Y".
    "batch_norm": [["X"], ["Y"]],
    # sigmoid is an activation op like relu/tanh above: its output slot is
    # "Out", not "Y".
    "sigmoid": [["X"], ["Out"]],
}

W
WangZhen 已提交
89

90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
def _get_op_input_var_names(op):
    """
    Collect the variable names bound to the real input slots of an op.

    The slot names are looked up in `_op_real_in_out_name` by op type, so an
    op type missing from that table raises KeyError.

    Args:
        op(IrNode|Operator): the op to inspect.
    Returns:
        list: the names returned by `op.input(slot)` for every listed slot,
            flattened into a single list.
    """
    assert isinstance(op, (IrNode, Operator)), \
        "The input op should be IrNode or Operator."
    # IrNode exposes its type through name(), Operator through `.type`.
    if isinstance(op, IrNode):
        op_type = op.name()
    else:
        op_type = op.type
    collected = []
    for slot in _op_real_in_out_name[op_type][0]:
        bound = op.input(slot)
        collected += bound if isinstance(bound, list) else [bound]
    return collected


def _get_op_output_var_names(op):
    """
    Collect the variable names bound to the real output slots of an op.

    The slot names are looked up in `_op_real_in_out_name` by op type, so an
    op type missing from that table raises KeyError.

    Args:
        op(IrNode|Operator): the op to inspect.
    Returns:
        list: the names returned by `op.output(slot)` for every listed slot,
            flattened into a single list.
    """
    assert isinstance(op, (IrNode, Operator)), \
        "The input op should be IrNode or Operator."
    # IrNode exposes its type through name(), Operator through `.type`.
    if isinstance(op, IrNode):
        op_type = op.name()
    else:
        op_type = op.type
    collected = []
    for slot in _op_real_in_out_name[op_type][1]:
        bound = op.output(slot)
        collected += bound if isinstance(bound, list) else [bound]
    return collected


124 125 126 127
def _init_var_node(var_node, value, scope, place):
    """
    Store `value` in the tensor of the scope variable named after `var_node`.

    Args:
        var_node: node whose `name()` selects the variable in `scope`.
        value(numpy.ndarray): data handed to the tensor's `set`.
        scope: object providing `var(name)`; must not be None.
        place: device place handed to the tensor's `set`; must not be None.
    """
    assert isinstance(value, np.ndarray), \
        'The type of value should be numpy array.'
    assert scope is not None, 'The scope cannot be set None.'
    assert place is not None, 'The place cannot be set None.'
    target_var = scope.var(var_node.name())
    target_var.get_tensor().set(value, place)


135 136 137 138 139
def _is_input_all_not_persistable(graph, op_node):
    '''
    Tell whether none of the real inputs of the op node is persistable.

    Every real input name is resolved against `op_node.inputs`; the result
    turns False as soon as one resolved node reports `persistable()`.
    '''
    none_persistable = True
    for input_name in _get_op_input_var_names(op_node):
        input_node = graph._find_node_by_name(op_node.inputs, input_name)
        # persistable() is only consulted while no persistable input has
        # been seen yet.
        if none_persistable and input_node.persistable():
            none_persistable = False
    return none_persistable


147
class QuantizationTransformPass(object):
148 149 150 151
    """
    Quantize the ops that have weights. Add quant and dequant ops for the quantized
    ops's inputs.
    """
152 153 154
    _supported_quantizable_op_type = [
        'conv2d', 'depthwise_conv2d', 'mul', 'matmul'
    ]
155

W
WangZhen 已提交
156
    def __init__(self,
157
                 scope=None,
158
                 place=None,
W
WangZhen 已提交
159 160 161 162
                 weight_bits=8,
                 activation_bits=8,
                 activation_quantize_type='abs_max',
                 weight_quantize_type='abs_max',
163
                 window_size=10000,
164
                 moving_rate=0.9,
165
                 skip_pattern=['skip_quant'],
166
                 quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']):
W
WangZhen 已提交
167
        """
168
        Constructor.
169

W
WangZhen 已提交
170
        Args:
171
            scope(fluid.Scope): When activation use 'range_abs_max' as the quantize
172 173
                type, this pass will create some new parameters. The scope is used to
                initialize these new parameters.
174
            place(fluid.CPUPlace|fluid.CUDAPlace): place is used to initialize new
175
                parameters described above.
176
            weight_bits(int): quantization bit number for weights,
W
WangZhen 已提交
177
                the bias is not quantized.
178 179
            activation_bits(int): quantization bit number for activation.
            activation_quantize_type(str): quantization type for activation,
180 181 182 183 184
                now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
                If use 'abs_max' mode, the quantization scale will be calculated
                dynamically each step in both training and testing period. If use
                'range_abs_max', a static quantization scale will be calculated
                during training and used in inference.
185
            weight_quantize_type(str): quantization type for weights,
186 187 188
                support 'abs_max' and 'channel_wise_abs_max'. The 'range_abs_max'
                usually is not used for weight, since weights are fixed once the
                model is well trained.
189 190
            window_size(int): the window size for 'range_abs_max' quantization.
            moving_rate(float): the param for 'moving_average_abs_max' quantization.
191
            skip_pattern(str or str list): The user-defined quantization skip pattern, which
192
                will be presented in the name scope of an op. When the skip pattern is
193
                detected in an op's name scope, the corresponding op will not be quantized. 
194
            quantizable_op_type(list[str]): List the type of ops that will be quantized. 
195 196
                Default is ["conv2d", "depthwise_conv2d", "mul"]. The quantizable_op_type in
                QuantizationFreezePass and ConvertToInt8Pass must be the same as this.
197

W
WangZhen 已提交
198 199
        Examples:
        .. code-block:: python
200 201 202 203
            # The original graph will be rewrite.
            import paddle.fluid as fluid
            from paddle.fluid.contrib.slim.quantization \
                import QuantizationTransformPass
204
            from paddle.fluid.contrib.slim.graph import IrGraph
205 206
            from paddle.fluid import core

207
            graph = IrGraph(core.Graph(program.desc), for_test=False)
208
            place = fluid.CPUPlace()
209
            transform_pass = QuantizationTransformPass(fluid.global_scope(),
210
            place)
211
            transform_pass.apply(graph)
W
WangZhen 已提交
212
        """
213
        self._scope = scope
214
        self._place = place
215 216
        self._weight_bits = weight_bits
        self._activation_bits = activation_bits
217
        self._skip_pattern = skip_pattern
W
WangZhen 已提交
218

219 220 221 222
        quant_type = [
            'abs_max', 'channel_wise_abs_max', 'range_abs_max',
            'moving_average_abs_max'
        ]
223 224
        assert activation_quantize_type != 'channel_wise_abs_max', \
            "The activation quantization type does not support 'channel_wise_abs_max'."
W
WangZhen 已提交
225 226
        if activation_quantize_type not in quant_type:
            raise ValueError(
227 228 229
                "Unknown activation_quantize_type : '%s'. It can only be "
                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'." %
                (str(activation_quantize_type)))
W
WangZhen 已提交
230 231
        if weight_quantize_type not in quant_type:
            raise ValueError(
232 233 234
                "Unknown weight_quantize_type: '%s'. It can only be "
                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'."
                % (str(weight_quantize_type)))
W
WangZhen 已提交
235

236 237 238
        self._activation_quantize_type = activation_quantize_type
        self._weight_quantize_type = weight_quantize_type
        self._window_size = window_size
239
        self._moving_rate = moving_rate
W
WangZhen 已提交
240

241 242
        self._quantizable_ops = quantizable_op_type
        for op in self._quantizable_ops:
243
            assert op in QuantizationTransformPass._supported_quantizable_op_type, \
244
                op + " is not supported for quantization."
245
        self._conv_ops = ['conv2d', 'depthwise_conv2d']
246 247
        self._quantizable_grad_ops = [
            '%s_grad' % (op) for op in self._quantizable_ops
W
WangZhen 已提交
248
        ]
249 250
        self._is_test = None
        self._global_step = None
W
WangZhen 已提交
251

252
    def apply(self, graph):
        """
        Quantize the graph for training process. According to weight and
        activation quantization type, the graph will be added some fake
        quantize operators and fake dequantize operators.

        Args:
            graph(IrGraph): the applied graph.
        Returns:
            IrGraph: the graph that was passed in, after it has been
                transformed in place.
        """
        assert isinstance(graph,
                          IrGraph), 'graph must be the instance of IrGraph.'
        self._is_test = graph.is_test()
        # marked the variable which has been dequantized.
        dequantized_vars = collections.OrderedDict()
        # Only used for membership tests below, so a set is sufficient.
        persistable_vars = {p.name() for p in graph.all_persistable_nodes()}

        def _quant_preprocess(op_node):
            # Mark the op with `skip_quant` when its name scope contains one
            # of the user-defined skip patterns.
            user_skipped = False
            if isinstance(self._skip_pattern, list):
                user_skipped = op_node.op().has_attr("op_namescope") and \
                               any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern)
            elif isinstance(self._skip_pattern, str):
                user_skipped = op_node.op().has_attr("op_namescope") and \
                               op_node.op().attr("op_namescope").find(self._skip_pattern) != -1

            if user_skipped:
                op_node.op()._set_attr("skip_quant", True)

        def _transform_forward(graph, op):
            op.op()._set_attr("quantization_type", "qat_with_weight")
            for var_node in op.inputs:
                # input_arg_names() is re-read on every iteration on purpose:
                # update_input_link() below changes the op while we loop.
                if var_node.name() not in op.input_arg_names():
                    continue
                if var_node.name() in dequantized_vars:
                    # Reuse the quant/dequant pair already built for this var.
                    dequant_var_node = dequantized_vars[var_node.name()]
                else:
                    # Persistable inputs are treated as weights, the others
                    # as activations.
                    quant_bits = self._weight_bits if var_node.name() in persistable_vars \
                        else self._activation_bits
                    quant_type = self._weight_quantize_type if var_node.name() \
                        in persistable_vars else self._activation_quantize_type
                    if quant_type == 'channel_wise_abs_max':
                        assert var_node.name(
                        ) in persistable_vars, "'channel_wise_abs_max' can only be applied on weights."
                        if op.name() in self._conv_ops:
                            quant_var_node, scale_var_node = self._insert_channel_quant_op(
                                graph, var_node, quant_bits)
                            dequant_var_node = self._insert_channel_dequant_op(
                                graph, quant_var_node, [scale_var_node],
                                [quant_bits])
                        else:
                            # Non-conv ops fall back to plain 'abs_max'.
                            quant_var_node, scale_var_node = self._insert_quant_op(
                                graph, var_node, quant_bits, 'abs_max')
                            dequant_var_node = self._insert_dequant_op(
                                graph, quant_var_node, scale_var_node,
                                quant_bits)
                    else:
                        quant_var_node, scale_var_node = self._insert_quant_op(
                            graph, var_node, quant_bits, quant_type)
                        dequant_var_node = self._insert_dequant_op(
                            graph, quant_var_node, scale_var_node, quant_bits)
                    dequantized_vars[var_node.name()] = dequant_var_node
                graph.update_input_link(var_node, dequant_var_node, op)

        def _transform_backward(graph, op):
            # Redirect grad-op inputs to the dequantized vars created while
            # transforming the forward ops.
            for var_node in op.inputs:
                if var_node.name() not in op.input_arg_names():
                    continue
                if var_node.name() in dequantized_vars:
                    dequant_var_node = dequantized_vars[var_node.name()]
                    graph.update_input_link(var_node, dequant_var_node, op)

        if not self._is_test:
            self._create_global_step(graph)
        ops = graph.all_op_nodes()
        # Do the preprocess of quantization, such as skipping some ops
        # for not being quantized.
        for op in ops:
            if op.name() in self._quantizable_ops or \
                    op.name() in self._quantizable_grad_ops:
                _quant_preprocess(op)
        # The process of _transform_forward and _transform_backward is needed in two for loops.
        # The loop for transforming the forward graph:
        for op in ops:
            if op.name() in self._quantizable_ops:
                if not self._is_skip_quant(graph, op):
                    _transform_forward(graph, op)
        # The loop for renaming the inputs of backward op.
        for op in ops:
            if op.name() in self._quantizable_grad_ops:
                _transform_backward(graph, op)
        graph.resolve_hazard()
        return graph
W
WangZhen 已提交
346

W
WangZhen 已提交
347
    def _create_global_step(self, graph):
348 349
        if self._weight_quantize_type == 'range_abs_max' or \
                self._activation_quantize_type == 'range_abs_max':
W
WangZhen 已提交
350
            counter_name = cpt.to_text('@STEP_COUNTER@')
351
            for node in graph.all_var_nodes():
W
WangZhen 已提交
352
                if node.name() == counter_name:
353 354
                    self._global_step = node
            if self._global_step is None:
355
                global_step_in = graph.create_persistable_node(
W
WangZhen 已提交
356 357 358 359
                    name=counter_name,
                    var_type=core.VarDesc.VarType.LOD_TENSOR,
                    shape=[1],
                    var_dtype=core.VarDesc.VarType.INT64)
360 361 362 363 364 365
                _init_var_node(
                    global_step_in,
                    np.zeros(
                        [1], dtype='int64'),
                    self._scope,
                    self._place)
W
WangZhen 已提交
366 367
                global_step_out = graph.create_var_node_from_desc(
                    global_step_in.var())
368
                # The attribute of `op_role` is needed by ParallelExecutor.
W
WangZhen 已提交
369 370
                increment_op = graph.create_op_node(
                    op_type='increment',
371 372 373 374 375
                    attrs={
                        'step': 1.0,
                        'op_role':
                        core.op_proto_and_checker_maker.OpRole.Forward
                    },
W
WangZhen 已提交
376 377
                    inputs={'X': global_step_in},
                    outputs={'Out': global_step_out})
378 379 380
                graph.link_to(global_step_in, increment_op)
                graph.link_to(increment_op, global_step_out)
                self._global_step = global_step_out
W
WangZhen 已提交
381

W
WangZhen 已提交
382 383 384 385 386 387 388
    def _insert_quant_op(self, graph, var_node, quant_bits, quant_type):
        """
        Insert fake_quantize_op in the graph.
        """
        if quant_type == 'abs_max':
            return self._insert_quant_abs_max_op(graph, var_node, quant_bits)
        elif quant_type == 'range_abs_max':
W
WangZhen 已提交
389 390
            return self._insert_quant_range_abs_max_op(graph, var_node,
                                                       quant_bits)
391 392 393
        elif quant_type == 'moving_average_abs_max':
            return self._insert_quant_moving_average_abs_max_op(graph, var_node,
                                                                quant_bits)
W
WangZhen 已提交
394 395 396 397 398 399 400 401 402

    def _insert_quant_abs_max_op(self, graph, var_node, quant_bits):
        """
        Add a fake_quantize_abs_max op that consumes `var_node`.

        Returns:
            tuple: (quantized var node, scale var node) produced by the op.
        """
        assert var_node.is_var(), '{} is not a var'.format(var_node.name())

        # The quantized output mirrors the input's type, shape and dtype.
        quantized_out = graph.create_var_node(
            name=self._quantized_var_name(var_node.name()),
            var_type=var_node.type(),
            shape=var_node.shape(),
            var_dtype=var_node.dtype())
        # The scale output has a single element.
        scale_out = graph.create_var_node(
            name=self._quantized_scale_name(var_node.name()),
            var_type=var_node.type(),
            shape=[1],
            var_dtype=var_node.dtype())
        op_attrs = {
            'bit_length': quant_bits,
            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
        }
        quant_op = graph.create_op_node(
            op_type='fake_quantize_abs_max',
            attrs=op_attrs,
            inputs={'X': var_node},
            outputs={'Out': quantized_out,
                     'OutScale': scale_out})
        graph.link_to(var_node, quant_op)
        for produced in (quantized_out, scale_out):
            graph.link_to(quant_op, produced)
        return quantized_out, scale_out

    def _insert_quant_range_abs_max_op(self, graph, var_node, quant_bits):
        """
        Add a fake_quantize_range_abs_max op that consumes `var_node`.

        A persistable input scale (initialized to 0.001) is always created.
        When the graph is not in test mode, a persistable window of
        `self._window_size` scales (initialized to zeros) and the global
        step are wired into the op as well.

        Returns:
            tuple: (quantized var node, output scale var node).
        """
        assert var_node.is_var(), '{} is not a var'.format(var_node.name())

        lod_tensor = core.VarDesc.VarType.LOD_TENSOR
        quantized_out = graph.create_var_node(
            name=self._quantized_var_name(var_node.name()),
            var_type=var_node.type(),
            shape=var_node.shape(),
            var_dtype=var_node.dtype())

        scale_in = graph.create_persistable_node(
            name=self._quantized_scale_name(var_node.name()),
            var_type=lod_tensor,
            shape=[1],
            var_dtype=var_node.dtype())
        # numpy dtype used to initialize the new persistable tensors.
        if var_node.dtype() == core.VarDesc.VarType.FP64:
            np_dtype = 'float64'
        else:
            np_dtype = 'float32'
        _init_var_node(scale_in,
                       np.array(
                           [0.001], dtype=np_dtype), self._scope, self._place)

        scale_out = graph.create_var_node_from_desc(scale_in.var())
        op_inputs = {'X': var_node, 'InScale': scale_in}
        op_outputs = {'Out': quantized_out, 'OutScale': scale_out}

        training = not self._is_test
        window_scales = None
        if training:
            # The name of scales_var_node maybe 'scales_0', 'scales_1', etc.
            window_scales = graph.create_persistable_node(
                name=unique_name.generate('scales'),
                var_type=lod_tensor,
                shape=[self._window_size],
                var_dtype=var_node.dtype())
            _init_var_node(window_scales,
                           np.zeros(
                               [self._window_size], dtype=np_dtype),
                           self._scope, self._place)
            op_inputs['Iter'] = self._global_step
            op_outputs['OutScales'] = window_scales

        quant_op = graph.create_op_node(
            op_type='fake_quantize_range_abs_max',
            attrs={
                'window_size': self._window_size,
                'bit_length': quant_bits,
                'is_test': self._is_test,
                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
            },
            inputs=op_inputs,
            outputs=op_outputs)

        graph.link_to(var_node, quant_op)
        graph.link_to(scale_in, quant_op)
        graph.link_to(quant_op, quantized_out)
        graph.link_to(quant_op, scale_out)
        if training:
            graph.link_to(self._global_step, quant_op)
            graph.link_to(quant_op, window_scales)

        return quantized_out, scale_out

496 497 498 499 500 501 502 503 504 505 506 507 508 509
    def _insert_quant_moving_average_abs_max_op(self, graph, var_node,
                                                quant_bits):
        """
        Add a fake_quantize_moving_average_abs_max op that consumes `var_node`.

        A persistable input scale (initialized to 0.001) is always created.
        When the graph is not in test mode, persistable state and accum
        tensors (each initialized to ones) are wired into the op together
        with their output counterparts.

        Returns:
            tuple: (quantized var node, output scale var node).
        """
        lod_tensor = core.VarDesc.VarType.LOD_TENSOR
        quantized_out = graph.create_var_node(
            name=self._quantized_var_name(var_node.name()),
            var_type=var_node.type(),
            shape=var_node.shape(),
            var_dtype=var_node.dtype())
        scale_in = graph.create_persistable_node(
            name=self._quantized_scale_name(var_node.name()),
            var_type=lod_tensor,
            shape=[1],
            var_dtype=var_node.dtype())
        # numpy dtype used to initialize the new persistable tensors.
        if var_node.dtype() == core.VarDesc.VarType.FP64:
            np_dtype = 'float64'
        else:
            np_dtype = 'float32'
        _init_var_node(scale_in,
                       np.array(
                           [0.001], dtype=np_dtype), self._scope, self._place)

        scale_out = graph.create_var_node_from_desc(scale_in.var())
        op_inputs = {'X': var_node, 'InScale': scale_in}
        op_outputs = {'Out': quantized_out, 'OutScale': scale_out}

        training = not self._is_test
        state_in = accum_in = state_out = accum_out = None
        if training:
            state_in = graph.create_persistable_node(
                name=unique_name.generate('state'),
                var_type=lod_tensor,
                var_dtype=var_node.dtype(),
                shape=[1])
            _init_var_node(state_in,
                           np.ones(
                               [1], dtype=np_dtype), self._scope, self._place)
            accum_in = graph.create_persistable_node(
                name=unique_name.generate('accum'),
                var_type=lod_tensor,
                var_dtype=var_node.dtype(),
                shape=[1])
            _init_var_node(accum_in,
                           np.ones(
                               [1], dtype=np_dtype), self._scope, self._place)
            state_out = graph.create_var_node_from_desc(state_in.var())
            accum_out = graph.create_var_node_from_desc(accum_in.var())

            op_inputs['InState'] = state_in
            op_inputs['InAccum'] = accum_in
            op_outputs['OutState'] = state_out
            op_outputs['OutAccum'] = accum_out

        quant_op = graph.create_op_node(
            op_type='fake_quantize_moving_average_abs_max',
            attrs={
                'bit_length': quant_bits,
                'moving_rate': self._moving_rate,
                'is_test': self._is_test,
                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
            },
            inputs=op_inputs,
            outputs=op_outputs)

        graph.link_to(var_node, quant_op)
        graph.link_to(scale_in, quant_op)
        graph.link_to(quant_op, quantized_out)
        graph.link_to(quant_op, scale_out)
        if training:
            graph.link_to(state_in, quant_op)
            graph.link_to(accum_in, quant_op)
            graph.link_to(quant_op, state_out)
            graph.link_to(quant_op, accum_out)

        return quantized_out, scale_out

583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612
    def _insert_channel_quant_op(self, graph, var_node, quant_bits):
        """
        Add a fake_channel_wise_quantize_abs_max op that consumes `var_node`.

        The scale output holds one entry per element along the first
        dimension of `var_node`.

        Returns:
            tuple: (quantized var node, scale var node) produced by the op.
        """
        assert var_node.is_var(), '{} is not a var'.format(var_node.name())

        quantized_out = graph.create_var_node(
            name=self._quantized_var_name(var_node.name()),
            var_type=var_node.type(),
            shape=var_node.shape(),
            var_dtype=var_node.dtype())
        scale_out = graph.create_var_node(
            name=self._quantized_scale_name(var_node.name()),
            var_type=var_node.type(),
            shape=[var_node.shape()[0]],
            var_dtype=var_node.dtype())
        op_attrs = {
            'bit_length': quant_bits,
            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
        }
        quant_op = graph.create_op_node(
            op_type='fake_channel_wise_quantize_abs_max',
            attrs=op_attrs,
            inputs={'X': var_node},
            outputs={'Out': quantized_out,
                     'OutScale': scale_out})
        graph.link_to(var_node, quant_op)
        for produced in (quantized_out, scale_out):
            graph.link_to(quant_op, produced)
        return quantized_out, scale_out

W
WangZhen 已提交
613 614 615 616 617 618 619 620
    def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits):
        """
        Add a fake_dequantize_max_abs op that consumes `var_node` and its scale.

        The op's `max_range` attribute is set to 2^(quant_bits - 1) - 1.

        Returns:
            The dequantized var node produced by the op.
        """
        assert var_node.is_var(), '{} is not a var'.format(var_node.name())

        dequantized_out = graph.create_var_node(
            name=self._dequantized_var_name(var_node.name()),
            var_type=var_node.type(),
            shape=var_node.shape(),
            var_dtype=var_node.dtype())
        # Largest magnitude representable with `quant_bits` signed bits.
        top_value = (1 << (quant_bits - 1)) - 1
        op_attrs = {
            'max_range': float(top_value),
            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
        }
        dequant_op = graph.create_op_node(
            op_type='fake_dequantize_max_abs',
            attrs=op_attrs,
            inputs={'X': var_node,
                    'Scale': scale_var_node},
            outputs={'Out': dequantized_out})
        for consumed in (var_node, scale_var_node):
            graph.link_to(consumed, dequant_op)
        graph.link_to(dequant_op, dequantized_out)
        return dequantized_out

639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665
    def _insert_channel_dequant_op(self, graph, var_node, scale_var_nodes,
                                   quant_bits):
        """
        Add a fake_channel_wise_dequantize_max_abs op that consumes `var_node`.

        Every node in `scale_var_nodes` is linked to the op as a scale
        input, and `quant_bits` is passed through as the op's `quant_bits`
        attribute.

        Returns:
            The dequantized var node produced by the op.
        """
        assert var_node.is_var(), '{} is not a var'.format(var_node.name())

        dequantized_out = graph.create_var_node(
            name=self._dequantized_var_name(var_node.name()),
            var_type=var_node.type(),
            shape=var_node.shape(),
            var_dtype=var_node.dtype())
        op_attrs = {
            'quant_bits': quant_bits,
            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
        }
        dequant_op = graph.create_op_node(
            op_type='fake_channel_wise_dequantize_max_abs',
            attrs=op_attrs,
            inputs={'X': var_node,
                    'Scales': scale_var_nodes},
            outputs={'Out': dequantized_out})
        graph.link_to(var_node, dequant_op)
        for scale_node in scale_var_nodes:
            graph.link_to(scale_node, dequant_op)
        graph.link_to(dequant_op, dequantized_out)
        return dequantized_out

W
WangZhen 已提交
666 667 668 669 670 671 672 673 674 675 676 677 678 679
    def _quantized_var_name(self, var_name):
        """
        Return quantized variable name for the input `var_name`.
        """
        return "%s.quantized" % (var_name)

    def _dequantized_var_name(self, var_name):
        """
        Return dequantized variable name for the input `var_name`.
        """
        return "%s.dequantized" % (var_name)

    def _quantized_scale_name(self, var_name):
        """
680
        Return the scale name of quantized variable for the input `var_name`.
W
WangZhen 已提交
681 682
        """
        return "%s.scale" % (var_name)
W
WangZhen 已提交
683

684
    def _is_skip_quant(self, graph, op_node):
        """
        Decide whether quantization is skipped for the op node.

        The op is skipped when any of the following holds:
          * its `skip_quant` attribute is set and truthy;
          * it is a `mul` or `matmul` whose real inputs are all not
            persistable;
          * its `quantization_type` attribute equals "qat_without_weight".
        """
        # All three checks are always evaluated, in this order.
        flagged_by_attr = bool(
            op_node.op().has_attr("skip_quant") and
            op_node.op().attr("skip_quant"))
        # if the inputs of mul and matmul are not all persistable, use
        # AddQuantDequantPass to quantize them.
        weightless_mul = bool(
            op_node.name() in ["mul", "matmul"] and
            _is_input_all_not_persistable(graph, op_node))
        quantized_without_weight = bool(
            op_node.op().has_attr("quantization_type") and
            op_node.op().attr("quantization_type") == "qat_without_weight")
        return flagged_by_attr or weightless_mul or quantized_without_weight

W
WangZhen 已提交
702 703 704 705 706 707 708

class QuantizationFreezePass(object):
    def __init__(self,
                 scope,
                 place,
                 weight_bits=8,
                 activation_bits=8,
                 weight_quantize_type='abs_max',
                 quantizable_op_type=None):
        """
        The freeze pass is used to adjust the quantize operator order, for example:
            1) `activation -> quant -> dequant -> conv2d` will be frozen into
            `activation -> quant -> conv2d -> dequant`
            2) `weight -> quant -> dequant -> conv2d` will be frozen into `weight -> conv2d`,
            and weight will be scaled offline.

        Args:
            scope(fluid.Scope): scope is used to get the weight tensor values.
            place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the weight tensors.
            weight_bits(int): quantization bit number for weights.
            activation_bits(int): quantization bit number for activation.
            weight_quantize_type(str): quantization type for weights, support 'abs_max' and
                'channel_wise_abs_max'. The 'range_abs_max' usually is not used for weight,
                since weights are fixed once the model is well trained.
            quantizable_op_type(list[str]): This input param will be removed later. The pass
                will process all quantized op, so it is not necessary to set the input param.
        """
        assert scope is not None, \
            'The scope cannot be set None.'
        assert place is not None, \
            'The place cannot be set None.'
        self._scope = scope
        self._place = place
        self._weight_bits = weight_bits
        self._activation_bits = activation_bits
        self._weight_quantize_type = weight_quantize_type
        self._conv_ops = ['conv2d', 'depthwise_conv2d']
        self._fake_quant_op_names = _fake_quant_op_list
        self._fake_dequant_op_names = _fake_dequant_op_list
        # Maps the output var of a removed fake quant/dequant op to the var
        # that should be consumed instead.
        self._op_input_rename_map = collections.OrderedDict()
        # Maps the output var of a quantized op to its dequantized var.
        self._op_output_rename_map = collections.OrderedDict()
        # Maps a quantized var name to its scale: a number or a list of
        # numbers for weights, an IrNode for activations.
        self._quant_var_scale_map = collections.OrderedDict()

    def apply(self, graph):
        """
        Adjust quantize/dequantize operators order for the inference process.

        Args:
            graph(IrGraph): the applied graph.
        Returns:
            IrGraph: the `graph` object passed in, after being modified.
        """
        # Get input scales in fake quant op and process weights
        persistable_vars = {p.name() for p in graph.all_persistable_nodes()}
        for op_node in graph.all_op_nodes():
            if op_node.name() not in self._fake_quant_op_names:
                continue
            input_arg_name = op_node.input('X')[0]
            if input_arg_name in persistable_vars:
                scale_v = self._weight_scale(op_node, input_arg_name)
                self._quant_var_scale_map[input_arg_name] = scale_v
                self._remove_fake_quant_and_dequant_op(graph, op_node)
                # quantize weight and restore
                param_v = self._load_var(input_arg_name)
                quantized_param_v = self._quant(param_v, scale_v,
                                                self._weight_bits)
                self._restore_var(input_arg_name, quantized_param_v)
            else:
                scale_v = graph._find_node_by_name(
                    op_node.outputs, op_node.output('OutScale')[0])
                self._quant_var_scale_map[input_arg_name] = scale_v

        # Remove all fake dequant op
        for op_node in graph.all_op_nodes():
            if op_node.name() in self._fake_dequant_op_names:
                self._remove_fake_quant_and_dequant_op(graph, op_node)

        # Insert post dequant op
        ops = graph.all_op_nodes()
        for op_node in ops:
            op_node_desc = op_node.op()
            if op_node_desc.has_attr("quantization_type") and \
                op_node_desc.attr("quantization_type") == "qat_with_weight":
                if self._weight_quantize_type == 'channel_wise_abs_max' \
                    and op_node.name() in self._conv_ops:
                    self._insert_post_channel_dequant_op(graph, op_node)
                else:
                    self._insert_post_dequant_op(graph, op_node)

        # Rename inputs of the followed ops after inserting dequant_op after fc/conv.
        # `ops` is deliberately the list captured before the dequant ops were
        # inserted: the new dequant ops consume the un-dequantized output and
        # must not be rewired to their own output.
        for op_node in ops:
            for var_node in op_node.inputs:
                if var_node.node in self._op_output_rename_map:
                    new_in = self._op_output_rename_map[var_node.node]
                    graph.update_input_link(var_node, new_in, op_node)

        # remove the unused var node in the graph
        self._remove_unused_var_nodes(graph)
        graph.resolve_hazard()
        return graph

    def _weight_scale(self, op_node, weight_name):
        """
        Compute the quantization scale of the weight `weight_name`.

        For 'abs_max' the scale is the max absolute value of the tensor. For
        'channel_wise_abs_max' it is a list with one max absolute value per
        slice along axis 0 when the tensor is 4-D, otherwise the max absolute
        value of the whole tensor. For any other quantize type the first
        element of the fake quant op's `OutScale` variable is used.
        """
        if self._weight_quantize_type == 'abs_max':
            return np.max(np.abs(self._load_var(weight_name)))
        if self._weight_quantize_type == 'channel_wise_abs_max':
            param = self._load_var(weight_name)
            if len(param.shape) == 4:  # conv2d or depthwise_conv2d
                return [np.max(np.abs(channel)) for channel in param]
            return np.max(np.abs(param))
        return self._load_var(op_node.output('OutScale')[0])[0]

    def _remove_fake_quant_and_dequant_op(self, graph, op_node):
        """
        Remove a fake quant/dequant op and remember that its output var
        should be replaced by its (already resolved) input var.
        """
        k = graph._find_node_by_name(op_node.outputs, op_node.output('Out')[0])
        v = graph._find_node_by_name(op_node.inputs, op_node.input('X')[0])
        # If the input was itself the output of a removed op, chain through
        # to the var recorded for it.
        self._op_input_rename_map[k.node] = self._op_input_rename_map.get(
            v.node, v)
        graph.safe_remove_nodes(op_node)

    def _iter_input_scales(self, graph, op_node):
        """
        Walk the inputs of `op_node`, relink each input that was produced by
        a removed fake quant/dequant op to its original var, and yield
        `(original_var_name, scale)` for every input that is an op argument.
        """
        for var_node in op_node.inputs:
            name = var_node.name()
            # The arg names are re-queried on every iteration, as in the
            # original code; presumably update_input_link may change them.
            if name not in op_node.input_arg_names():
                continue
            if var_node.node in self._op_input_rename_map:
                new_in = self._op_input_rename_map[var_node.node]
                new_in.clear_outputs()
                graph.update_input_link(var_node, new_in, op_node)
            original_var_name = self._original_var_name(name)
            yield original_var_name, self._quant_var_scale_map[
                original_var_name]

    def _single_output_var_node(self, graph, op_node):
        """
        Return the only output var node of `op_node`.

        Raises:
            ValueError: if the op does not have exactly one output argument.
        """
        if len(op_node.output_arg_names()) != 1:
            raise ValueError("Only support one output, but op %s has"
                             " more than one output." % (op_node.name()))
        return graph._find_node_by_name(op_node.outputs,
                                        op_node.output_arg_names()[0])

    def _insert_post_channel_dequant_op(self, graph, op_node):
        """
        Insert a `fake_channel_wise_dequantize_max_abs` op after `op_node`.

        Returns:
            The var node holding the dequantized output.
        Raises:
            ValueError: if the op has more than one output, or lacks either a
                quantized weight input or a quantized activation input.
        """
        persistable_vars = {p.name() for p in graph.all_persistable_nodes()}
        channel_scale = None
        scale_var_node = None
        for original_var_name, scale_v in self._iter_input_scales(graph,
                                                                  op_node):
            if original_var_name in persistable_vars:
                assert isinstance(
                    scale_v,
                    list), 'The scale of parameter %s is not a list.' % (
                        original_var_name)
                channel_scale = np.array(scale_v)
            else:
                assert isinstance(scale_v, IrNode)
                scale_var_node = scale_v

        output_var_node = self._single_output_var_node(graph, op_node)
        if channel_scale is None or scale_var_node is None:
            raise ValueError(
                "The op %s must have both a quantized weight input and a "
                "quantized activation input." % (op_node.name()))

        weight_scale_node = graph.create_persistable_node(
            name=unique_name.generate('channel_scale'),
            var_type=core.VarDesc.VarType.LOD_TENSOR,
            shape=[channel_scale.shape[0]],
            var_dtype=output_var_node.dtype())
        data_type = 'float64' if output_var_node.dtype(
        ) == core.VarDesc.VarType.FP64 else 'float32'
        _init_var_node(weight_scale_node,
                       channel_scale.astype(data_type), self._scope,
                       self._place)
        dequant_var_node = graph.create_var_node(
            name=self._dequantized_var_name(output_var_node.name()),
            var_type=output_var_node.type(),
            shape=output_var_node.shape(),
            var_dtype=output_var_node.dtype())
        dequant_op_node = graph.create_op_node(
            op_type='fake_channel_wise_dequantize_max_abs',
            attrs={
                'quant_bits': [self._weight_bits, self._activation_bits],
                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
            },
            inputs={
                'X': output_var_node,
                'Scales': [weight_scale_node, scale_var_node]
            },
            outputs={'Out': dequant_var_node})
        graph.link_to(output_var_node, dequant_op_node)
        graph.link_to(scale_var_node, dequant_op_node)
        graph.link_to(weight_scale_node, dequant_op_node)
        graph.link_to(dequant_op_node, dequant_var_node)
        self._op_output_rename_map[output_var_node.node] = dequant_var_node
        return dequant_var_node

    def _insert_post_dequant_op(self, graph, op_node):
        """
        Insert a `fake_dequantize_max_abs` op after `op_node`.

        The `max_range` attribute is the product, over the op's inputs, of
        `weight_range / weight_scale` for weights and `activation_range` for
        activations.

        Returns:
            The var node holding the dequantized output.
        Raises:
            ValueError: if the op has more than one output, or has no
                quantized activation input.
        """
        persistable_vars = {p.name() for p in graph.all_persistable_nodes()}
        max_range = 1
        param_range = (1 << (self._weight_bits - 1)) - 1
        act_range = (1 << (self._activation_bits - 1)) - 1
        scale_var_node = None
        for original_var_name, scale_v in self._iter_input_scales(graph,
                                                                  op_node):
            if original_var_name in persistable_vars:
                assert self._is_float(
                    scale_v), 'The scale of parameter %s is not a float.' % (
                        original_var_name)
                # An all-zero weight has scale 0; avoid an infinite range.
                max_range *= param_range / self._nonzero_scale(scale_v)
            else:
                max_range *= act_range
                assert isinstance(scale_v, IrNode)
                scale_var_node = scale_v

        output_var_node = self._single_output_var_node(graph, op_node)
        if scale_var_node is None:
            raise ValueError(
                "The op %s must have a quantized activation input." %
                (op_node.name()))

        dequant_var_node = graph.create_var_node(
            name=self._dequantized_var_name(output_var_node.name()),
            var_type=output_var_node.type(),
            shape=output_var_node.shape(),
            var_dtype=output_var_node.dtype())
        dequant_op_node = graph.create_op_node(
            op_type='fake_dequantize_max_abs',
            attrs={
                'max_range': float(max_range),
                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
            },
            inputs={'X': output_var_node,
                    'Scale': scale_var_node},
            outputs={'Out': dequant_var_node})
        graph.link_to(output_var_node, dequant_op_node)
        graph.link_to(scale_var_node, dequant_op_node)
        graph.link_to(dequant_op_node, dequant_var_node)
        self._op_output_rename_map[output_var_node.node] = dequant_var_node
        return dequant_var_node

    def _load_var(self, name):
        """
        Return a numpy array built from the tensor of scope variable `name`.
        """
        return np.array(self._scope.find_var(name).get_tensor())

    def _restore_var(self, name, array):
        """
        Write `array` into the tensor of scope variable `name` on `self._place`.
        """
        tensor = self._scope.find_var(name).get_tensor()
        tensor.set(array, self._place)

    def _remove_unused_var_nodes(self, graph):
        """
        Remove every var node that is neither an input nor an output of any
        op node in `graph`.
        """
        used_nodes = set()
        for op_node in graph.all_op_nodes():
            for var_node in op_node.inputs:
                used_nodes.add(var_node.node)
            for var_node in op_node.outputs:
                used_nodes.add(var_node.node)
        all_unused_vars = {
            var_node
            for var_node in graph.all_var_nodes()
            if var_node.node not in used_nodes
        }
        graph.safe_remove_nodes(all_unused_vars)

    def _original_var_name(self, var_name):
        """
        Return the original variable name, i.e. `var_name` without a trailing
        `.quantized.dequantized`, `.quantized`, `.dequantized` or `.scale`.
        Only the first matching suffix (in that order) is stripped.
        """
        for suffix in ('.quantized.dequantized', '.quantized',
                       '.dequantized', '.scale'):
            if var_name.endswith(suffix):
                return var_name[:-len(suffix)]
        return var_name

    def _dequantized_var_name(self, var_name):
        """
        Return dequantized variable name for the input `var_name`.
        """
        return "%s.dequantized" % (var_name)

    def _is_float(self, v):
        """
        Return True if `v` is a Python float or a numpy float32/float64 scalar.
        """
        return isinstance(v, (float, np.float32, np.float64))

    @staticmethod
    def _nonzero_scale(scale):
        """
        Return `scale`, or a tiny positive number when `scale` is a scalar
        equal to zero, so that dividing by the result never yields NaN/inf.
        Non-scalar scales are returned untouched.
        """
        if np.ndim(scale) == 0 and scale == 0:
            return 1e-8
        return scale

    def _quant(self, x, scale, num_bits):
        """
        Quantize `x` to signed `num_bits` integers (kept in x's float dtype).

        Args:
            x(numpy.ndarray): the values to quantize. It is not modified.
            scale(list|number): a single scale for the whole tensor, or a
                list with one scale per slice of `x` along axis 0.
            num_bits(int): the quantization bit number.
        Returns:
            numpy.ndarray: round(x / scale * (2 ** (num_bits - 1) - 1)). A
            zero scale (all-zero tensor or channel) gives zeros, not NaN.
        """
        bnt = (1 << (num_bits - 1)) - 1
        if isinstance(scale, list):
            # Work on a copy so the caller's array is left untouched.
            quantized = np.array(x)
            for i, s in enumerate(scale):
                quantized[i] = np.round(x[i] / self._nonzero_scale(s) * bnt)
            return quantized
        return np.round(x / self._nonzero_scale(scale) * bnt)
1000 1001 1002


class ConvertToInt8Pass(object):
    def __init__(self, scope, place, quantizable_op_type=None):
        """
        Convert the weights into int8_t type.

        Args:
            scope(fluid.Scope): scope is used to get the weight tensor values.
            place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the
                8bits weight tensors.
            quantizable_op_type(list[str]): This input param will be removed later. The pass
                will process all quantized op, so it is not necessary to set the input param.
        """
        assert scope is not None, \
            'The scope cannot be set None.'
        assert place is not None, \
            'The place cannot be set None.'
        self._scope = scope
        self._place = place

    def apply(self, graph):
        """
        Convert weights' type of the graph. After that, the data type of the
        graph weights is int8_t.

        Only persistable inputs of ops whose `quantization_type` attribute is
        `qat_with_weight` are converted; each weight is converted once and the
        resulting int8 node is shared by all ops that read it.

        Args:
            graph(IrGraph): the applied graph.
        Returns:
            IrGraph: the `graph` object passed in, after being modified.
        """
        persistable_vars = {p.name() for p in graph.all_persistable_nodes()}
        int8_nodes = {}
        for op_node in graph.all_op_nodes():
            op_desc = op_node.op()
            if not op_desc.has_attr("quantization_type"):
                continue
            if op_desc.attr("quantization_type") != "qat_with_weight":
                continue
            for var_node in op_node.inputs:
                name = var_node.name()
                if name not in persistable_vars:
                    continue
                if name not in int8_nodes:
                    int8_nodes[name] = self._convert_to_int8(graph, var_node)
                graph.update_input_link(var_node, int8_nodes[name], op_node)

        # remove the unused var node in the graph
        self._remove_unused_var_nodes(graph)
        graph.resolve_hazard()
        return graph

    def _convert_to_int8(self, graph, var_node):
        """
        Create a persistable INT8 node named `<var name>.int8` with the type
        and shape of `var_node`, and store the values of `var_node`'s scope
        tensor, cast to int8, in a scope variable of that name.
        """
        source_name = var_node.name()
        int8_var_node_name = source_name + ".int8"
        int8_var_node = graph.create_persistable_node(
            name=cpt.to_text(int8_var_node_name),
            var_type=var_node.type(),
            shape=var_node.shape(),
            var_dtype=core.VarDesc.VarType.INT8)
        weights = self._load_var(source_name)
        self._scope.var(int8_var_node_name)
        self._store_var(int8_var_node_name, weights, np.int8)
        return int8_var_node

    def _load_var(self, name):
        """
        Return a numpy array built from the tensor of scope variable `name`.
        """
        tensor = self._scope.find_var(name).get_tensor()
        return np.array(tensor)

    def _store_var(self, name, array, dtype):
        """
        Cast `array` to `dtype` and write it into the tensor of scope
        variable `name` on `self._place`.
        """
        converted = array.astype(dtype)
        self._scope.find_var(name).get_tensor().set(converted, self._place)

    def _remove_unused_var_nodes(self, graph):
        """
        Remove every var node that is neither an input nor an output of any
        op node in `graph`.
        """
        linked_vars = set()
        for op_node in graph.all_op_nodes():
            linked_vars.update(op_node.inputs)
            linked_vars.update(op_node.outputs)

        used_nodes = {var_node.node for var_node in linked_vars}
        all_unused_vars = {
            var_node
            for var_node in graph.all_var_nodes()
            if var_node.node not in used_nodes
        }
        graph.safe_remove_nodes(all_unused_vars)


class TransformForMobilePass(object):
    def __init__(self):
        """
        This pass is used to convert the frozen graph for paddle-mobile execution.
        """
        self._fake_quant_op_names = _fake_quant_op_list
        self._fake_dequant_op_names = _fake_dequant_op_list

    def apply(self, graph):
        """
        Because paddle-mobile uses `quantize` and `dequantize` as the names of
        the quantize operator and the dequantize operator, the `apply`
        function just realizes this renaming.

        Args:
            graph(IrGraph): the graph will be transformed.
        Returns:
            IrGraph: the `graph` object passed in, after being modified.
        """
        for op_node in graph.all_op_nodes():
            name = op_node.name()
            # The two checks are independent of each other, as in the
            # original implementation.
            if name in self._fake_quant_op_names:
                self._retype_op(graph, op_node, 'quantize')
            if name in self._fake_dequant_op_names:
                self._retype_op(graph, op_node, 'dequantize')
        graph.resolve_hazard()
        return graph

    def _retype_op(self, graph, op_node, new_type):
        """
        Set the type of `op_node` to `new_type`, create a new op node from its
        desc, link the new node to the same input and output vars, and remove
        `op_node` from the graph.
        """
        op_node.set_type(new_type)
        replacement = graph.create_op_node_from_desc(op_node.op())
        for input_node in op_node.inputs:
            graph.link_to(input_node, replacement)
        for output_node in op_node.outputs:
            graph.link_to(replacement, output_node)
        graph.safe_remove_nodes(op_node)
1129 1130


1131
class OutScaleForTrainingPass(object):
    def __init__(self, scope=None, place=None, moving_rate=0.9):
        """
        This pass is used for calculating output scales of some operators.
        These output scales may be used by tensorRT or some other inference engines.

        Args:
            scope(fluid.Scope): The scope is used to initialize these new parameters.
            place(fluid.CPUPlace|fluid.CUDAPlace): The place is used to initialize new parameters.
            moving_rate(float): The decay coefficient of moving average. The default value is 0.9.
        """
        self._scope = scope
        self._place = place
        self._moving_rate = moving_rate
        # Filled in by `apply` from `graph.is_test()`.
        self._is_test = None
        self._teller_set = _out_scale_op_list

    def apply(self, graph):
        """
        Insert the `moving_average_abs_max_scale` op in order to calculate output scales
        of operators in the teller_set.

        Args:
            graph(IrGraph): the target graph.
        Returns:
            IrGraph: the `graph` object passed in, after being modified.
        """
        assert isinstance(graph,
                          IrGraph), 'graph must be the instance of IrGraph.'
        self._is_test = graph.is_test()
        # Collect the targets first so that ops inserted below are not visited.
        target_ops = [
            op for op in graph.all_op_nodes()
            if op.name() in self._teller_set
        ]
        for op in target_ops:
            for output_var_name in _get_op_output_var_names(op):
                in_node = graph._find_node_by_name(op.outputs, output_var_name)
                out_node = graph.create_var_node_from_desc(in_node.var())
                scale_node = self._create_ones_node(
                    graph, self._scale_name(in_node.name()), in_node)
                ins = {'X': in_node}
                outs = {'Out': out_node, 'OutScale': scale_node}
                training = not self._is_test
                if training:
                    # State and accumulator of the moving average are only
                    # wired in when the graph is not a test graph.
                    state_in_node = self._create_ones_node(
                        graph, unique_name.generate('scale_state@'), in_node)
                    accum_in_node = self._create_ones_node(
                        graph, unique_name.generate('scale_accum@'), in_node)
                    state_out_node = graph.create_var_node_from_desc(
                        state_in_node.var())
                    accum_out_node = graph.create_var_node_from_desc(
                        accum_in_node.var())

                    ins['InState'] = state_in_node
                    ins['InAccum'] = accum_in_node
                    outs['OutState'] = state_out_node
                    outs['OutAccum'] = accum_out_node

                scale_op_node = graph.create_op_node(
                    op_type='moving_average_abs_max_scale',
                    attrs={
                        'moving_rate': self._moving_rate,
                        'is_test': self._is_test,
                        'op_role':
                        core.op_proto_and_checker_maker.OpRole.Forward
                    },
                    inputs=ins,
                    outputs=outs)
                graph.link_to(in_node, scale_op_node)
                graph.link_to(scale_op_node, out_node)
                graph.link_to(scale_op_node, scale_node)
                if training:
                    graph.link_to(state_in_node, scale_op_node)
                    graph.link_to(accum_in_node, scale_op_node)
                    graph.link_to(scale_op_node, state_out_node)
                    graph.link_to(scale_op_node, accum_out_node)
        graph.resolve_hazard()
        return graph

    def _create_ones_node(self, graph, name, like_node):
        """
        Create a persistable LOD_TENSOR node of shape [1] named `name` with
        the dtype of `like_node`, and initialize it to one in the scope.
        """
        node_dtype = like_node.dtype()
        node = graph.create_persistable_node(
            name=name,
            var_type=core.VarDesc.VarType.LOD_TENSOR,
            shape=[1],
            var_dtype=node_dtype)
        if node_dtype == core.VarDesc.VarType.FP64:
            np_dtype = 'float64'
        else:
            np_dtype = 'float32'
        _init_var_node(node,
                       np.ones(
                           [1], dtype=np_dtype), self._scope, self._place)
        return node

    def _scale_name(self, var_name):
        """
        Return the scale name for the var named `var_name`.
        """
        scale_name = "%s@scale" % var_name
        return scale_name


1243
class OutScaleForInferencePass(object):
    def __init__(self, scope=None):
        """
        This pass is used for setting output scales of some operators.
        These output scales may be used by tensorRT or some other inference engines.

        Args:
            scope(fluid.Scope): The scope is used to initialize these new parameters.
        """
        self._scope = scope
        self._teller_set = _out_scale_op_list

    def apply(self, graph):
        """
        Get output scales from the scope and set these scales in op_descs
        of operators in the teller_set.

        Each scale is read from the scope variable named by `_scale_name` and
        stored on the op as the float attribute `out_threshold`.

        Args:
            graph(IrGraph): the target graph.
        Returns:
            IrGraph: the `graph` object passed in, after being modified.
        Raises:
            ValueError: if the scale variable of an op output is not found in
                the scope.
        """
        assert isinstance(graph,
                          IrGraph), 'graph must be the instance of IrGraph.'
        for op_node in graph.all_op_nodes():
            if op_node.name() not in self._teller_set:
                continue
            output_var_name = _get_op_output_var_names(op_node)
            assert len(output_var_name) == 1, "Only support collecting " \
                "output for op that only has an activation output for now."
            scale_name = self._scale_name(output_var_name[0])
            scale_var = self._scope.find_var(scale_name)
            # Fail with a clear message instead of an AttributeError on None
            # when the scale was never collected for this output.
            if scale_var is None:
                raise ValueError(
                    "Can not find the scale variable %s in the scope." %
                    (scale_name))
            scale_v = np.array(scale_var.get_tensor())[0]
            op_node.op()._set_attr("out_threshold", float(scale_v))
        graph.resolve_hazard()
        return graph

    def _scale_name(self, var_name):
        """
        Return the scale name for the var named `var_name`.
        """
        return "%s@scale" % (var_name)
1283 1284 1285


class AddQuantDequantPass(object):
    """
    Quantize the ops that do not have weights, and add quant_dequant op for the
    quantized ops's inputs.
    """

    # Every op type this pass is able to handle.
    _supported_quantizable_op_type = [
        "pool2d", "elementwise_add", "concat", "softmax", "argmax", "transpose",
        "equal", "gather", "greater_equal", "greater_than", "less_equal",
        "less_than", "mean", "not_equal", "reshape", "reshape2",
        "bilinear_interp", "nearest_interp", "trilinear_interp", "slice",
        "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6",
        "leaky_relu", "tanh", "swish"
    ]

    # To be compatible with PaddleSlim, not remove _activation_type for now
    _activation_type = ["relu", "relu6", "leaky_relu", "tanh", "swish"]

    def __init__(self,
                 scope=None,
                 place=None,
                 moving_rate=0.9,
                 quant_bits=8,
                 skip_pattern=["skip_quant"],
                 quantizable_op_type=["elementwise_add", "pool2d"],
                 is_full_quantized=False):
        """
        Constructor.

        Args:
            scope(fluid.Scope): The scope is used to initialize these new parameters.
            place(fluid.CPUPlace|fluid.CUDAPlace): place is used to initialize new
                parameters described above.
            moving_rate(float, optional): the param for 'quant_dequant_moving_average_abs_max'
                quantization. Default is 0.9.
            quant_bits(int, optional): quantization bit number for activation. Default is 8.
            skip_pattern(str|list[str], optional): The user-defined quantization skip
                pattern(s), which will be presented in the name scope of an op. When a
                skip pattern is detected in an op's name scope, the corresponding op
                will not be quantized. Default is ["skip_quant"].
            quantizable_op_type(list[str], optional): List the type of ops that will be
                quantized. Default is ["elementwise_add", "pool2d"].
            is_full_quantized(bool, optional): If set is_full_quantized as True, apply
                quantization to all supported quantizable op type. If set is_full_quantized
                as False, only apply quantization to the op type according to the input
                quantizable_op_type.

        Raises:
            AssertionError: if an entry of `quantizable_op_type` is not supported,
                or if `scope` or `place` is None.
        """
        self._scope = scope
        self._place = place
        self._moving_rate = moving_rate
        self._quant_bits = quant_bits
        # Set from graph.is_test() at the beginning of apply().
        self._is_test = None

        # The list arguments are copied: the signature keeps list defaults for
        # backward compatibility, and storing them (or the class-level list)
        # by reference would let a change made through one instance leak into
        # every later instance.
        if isinstance(skip_pattern, list):
            self._skip_pattern = list(skip_pattern)
        else:
            self._skip_pattern = skip_pattern

        if is_full_quantized:
            self._quantizable_op_type = list(
                AddQuantDequantPass._supported_quantizable_op_type)
        else:
            self._quantizable_op_type = list(quantizable_op_type)
            for op_type in self._quantizable_op_type:
                assert op_type in AddQuantDequantPass._supported_quantizable_op_type, \
                    op_type + " is not supported for quantization."
        self._quantizable_grad_op_type = [
            '%s_grad' % (op) for op in self._quantizable_op_type
        ]

        assert self._scope is not None, "scope must not be None."
        assert self._place is not None, "place must not be None."

    def apply(self, graph):
        """
        Add quant_dequant before some ops, such as the 'elementwise_add' and
        'pool2d' op.

        Args:
            graph(IrGraph): the target graph.
        Returns:
            IrGraph: the same graph, after `resolve_hazard()` has been called.
        """
        assert isinstance(graph,
                          IrGraph), 'graph must be the instance of IrGraph.'
        self._is_test = graph.is_test()
        # Maps an input var name to its quant_dequant output node, so that a
        # var consumed by several ops gets only one quant_dequant op.
        dequantized_vars_map = collections.OrderedDict()

        # Forward stage, insert quant_dequant op
        all_op_nodes = graph.all_op_nodes()
        for op_node in all_op_nodes:
            if op_node.name() not in self._quantizable_op_type:
                continue
            is_quantized = op_node.op().has_attr("quantization_type") and \
                op_node.op().attr("quantization_type") == "qat_with_weight"
            if self._is_skip_quant(op_node) or is_quantized or \
                (not _is_input_all_not_persistable(graph, op_node)):
                continue

            op_node.op()._set_attr("quantization_type", "qat_without_weight")
            op_node.op()._set_attr("activation_bits", self._quant_bits)
            for arg_name in _get_op_input_var_names(op_node):
                in_node = graph._find_node_by_name(op_node.inputs, arg_name)
                if arg_name in dequantized_vars_map:
                    quant_var_node = dequantized_vars_map[arg_name]
                else:
                    quant_var_node, _ = \
                        self._inser_quant_dequant_moving_average_abs_max_op(
                        graph, in_node, self._quant_bits)
                    dequantized_vars_map[arg_name] = quant_var_node
                graph.update_input_link(in_node, quant_var_node, op_node)

        # Backward stage, update input link
        for op_node in all_op_nodes:
            if op_node.name() not in self._quantizable_grad_op_type:
                continue
            for input_name in op_node.input_arg_names():
                if input_name in dequantized_vars_map:
                    in_node = graph._find_node_by_name(op_node.inputs,
                                                       input_name)
                    dequant_var_node = dequantized_vars_map[input_name]
                    graph.update_input_link(in_node, dequant_var_node, op_node)

        graph.resolve_hazard()
        return graph

    def _is_skip_quant(self, op_node):
        """
        Tell whether the "op_namescope" attribute of `op_node` contains a skip
        pattern. Returns False when the op has no such attribute or when the
        skip pattern is neither a list nor a str.
        """
        op = op_node.op()
        if not op.has_attr("op_namescope"):
            return False
        if isinstance(self._skip_pattern, list):
            name_scope = op.attr("op_namescope")
            return any(pattern in name_scope
                       for pattern in self._skip_pattern)
        if isinstance(self._skip_pattern, str):
            return self._skip_pattern in op.attr("op_namescope")
        return False

    def _inser_quant_dequant_moving_average_abs_max_op(self, graph, var_node,
                                                       quant_bits):
        """Insert fake_quantize_dequantize_moving_average_abs_max op.

        Args:
            graph(IrGraph): the graph the op is inserted into.
            var_node: the var node that becomes input 'X' of the new op.
            quant_bits(int): value of the 'bit_length' attribute of the new op.

        Returns:
            tuple: (quant_var_node, scale_out_node), i.e. the nodes linked as
                the 'Out' and 'OutScale' outputs of the new op.
        """
        quant_var_node = graph.create_var_node(
            name="{}.quant_dequant".format(var_node.name()),
            var_type=var_node.type(),
            shape=var_node.shape(),
            var_dtype=var_node.dtype())
        scale_in_node = graph.create_persistable_node(
            name="{}.quant_dequant.scale".format(var_node.name()),
            var_type=core.VarDesc.VarType.LOD_TENSOR,
            shape=[1],
            var_dtype=var_node.dtype())
        # Numpy dtype used for the initial values of all persistable nodes
        # created here.
        data_type = 'float64' if var_node.dtype(
        ) == core.VarDesc.VarType.FP64 else 'float32'
        _init_var_node(
            scale_in_node,
            np.array(
                [0.001], dtype=data_type),
            self._scope,
            self._place)

        scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
        ins = {'X': var_node, 'InScale': scale_in_node}
        outs = {'Out': quant_var_node, 'OutScale': scale_out_node}
        if not self._is_test:
            # The state and accum inputs/outputs are only attached when the
            # graph is not a test graph.
            state_in_node = graph.create_persistable_node(
                name=unique_name.generate('quant_dequant.state'),
                var_type=core.VarDesc.VarType.LOD_TENSOR,
                var_dtype=var_node.dtype(),
                shape=[1])
            _init_var_node(
                state_in_node,
                np.ones(
                    [1], dtype=data_type),
                self._scope,
                self._place)
            accum_in_node = graph.create_persistable_node(
                name=unique_name.generate('quant_dequant.accum'),
                var_type=core.VarDesc.VarType.LOD_TENSOR,
                var_dtype=var_node.dtype(),
                shape=[1])
            _init_var_node(
                accum_in_node,
                np.ones(
                    [1], dtype=data_type),
                self._scope,
                self._place)
            state_out_node = graph.create_var_node_from_desc(
                state_in_node.var())
            accum_out_node = graph.create_var_node_from_desc(
                accum_in_node.var())

            ins['InState'] = state_in_node
            ins['InAccum'] = accum_in_node
            outs['OutState'] = state_out_node
            outs['OutAccum'] = accum_out_node

        attrs = {
            'bit_length': quant_bits,
            'moving_rate': self._moving_rate,
            'is_test': self._is_test,
            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
        }

        quant_op_node = graph.create_op_node(
            op_type='fake_quantize_dequantize_moving_average_abs_max',
            attrs=attrs,
            inputs=ins,
            outputs=outs)

        graph.link_to(var_node, quant_op_node)
        graph.link_to(scale_in_node, quant_op_node)
        graph.link_to(quant_op_node, quant_var_node)
        graph.link_to(quant_op_node, scale_out_node)

        if not self._is_test:
            graph.link_to(state_in_node, quant_op_node)
            graph.link_to(accum_in_node, quant_op_node)
            graph.link_to(quant_op_node, state_out_node)
            graph.link_to(quant_op_node, accum_out_node)

        return quant_var_node, scale_out_node