#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

import paddle
from paddle import _legacy_C_ops, in_dynamic_mode
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _varbase_creator
from paddle.fluid.log_helper import get_logger
from paddle.framework import ParamAttr, core
from paddle.nn import Layer
from paddle.nn import functional as F
from paddle.nn.initializer import Constant
from paddle.nn.quant.lsq import FakeQuantActLSQPlus, FakeQuantWeightLSQPlus
from paddle.utils import unique_name

__all__ = [
    'FakeQuantAbsMax',
    'FakeQuantMovingAverageAbsMax',
    'FakeQuantChannelWiseAbsMax',
    'QuantizedConv2D',
    'QuantizedConv2DTranspose',
    'QuantizedLinear',
    'MovingAverageAbsMaxScale',
    'MAOutputScaleLayer',
    'FakeQuantMAOutputScaleLayer',
    'QuantStub',
    'QuantizedRowParallelLinear',
    'QuantizedColumnParallelLinear',
    'QuantizedMatmul',
]

_logger = get_logger(
    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s'
)


class FakeQuantAbsMax(Layer):
    r"""
    FakeQuantAbsMax layer does the abs_max quant and then dequant.
    Its computational formula is described as below:

    :math:`scale = max(abs(X))`
    :math:`range = 2^{bit\_length - 1} - 1`
    :math:`Out = round(X / scale * range) * scale / range`
    """

    def __init__(
        self,
        name=None,
        quant_bits=8,
        dtype='float32',
        quant_on_weight=False,
        reduce_type=None,
    ):
        super().__init__()
        self._quant_bits = quant_bits
        self._name = name
        self._reduce_type = reduce_type
        scale_prefix = (
            "{}.scale".format(name) if name else 'quant_dequant.scale'
        )
        self._scale_name = unique_name.generate(scale_prefix)
        if quant_on_weight:
            scale_attr = ParamAttr(
                name=self._scale_name,
                initializer=Constant(0.001),
                trainable=False,
            )
            self._scale = self.create_parameter(
                shape=[1], attr=scale_attr, dtype=self._dtype
            )
            self._scale.stop_gradient = True
        else:
            self._scale = None

    def forward(self, input):
        if in_dynamic_mode():
            attrs = ('bit_length', self._quant_bits)
            quant_out = _varbase_creator(
                type=input.type,
                name="{}.quantized.dequantized".format(input.name),
                shape=input.shape,
                dtype=input.dtype,
                persistable=False,
            )
            out_scale = self._scale
            if self._reduce_type == "max":
                paddle.distributed.all_reduce(
                    out_scale, op=paddle.distributed.ReduceOp.MAX
                )

            if not out_scale:
                out_scale = _varbase_creator(
                    type=core.VarDesc.VarType.LOD_TENSOR,
                    name=self._scale_name,
                    shape=[1],
                    dtype=self._dtype,
                    persistable=False,
                )
                out_scale.stop_gradient = True
            out, _, = _legacy_C_ops.fake_quantize_dequantize_abs_max(
                input, quant_out, out_scale, *attrs
            )
            return out

        check_variable_and_dtype(input, 'input', ['float32'], "FakeQuantAbsMax")
        attrs = {'bit_length': self._quant_bits}
        inputs = {"X": [input]}
        quant_out = self._helper.create_variable(
            name="{}.quantized.dequantized".format(input.name),
            dtype=input.dtype,
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=False,
        )
        out_scale = self._scale
        if not out_scale:
            out_scale = self._helper.create_variable(
                name=self._scale_name,
                dtype=self._dtype,
                type=core.VarDesc.VarType.LOD_TENSOR,
                persistable=False,
                stop_gradient=True,
            )
        outputs = {"Out": [quant_out], "OutScale": [out_scale]}

        self._helper.append_op(
            type="fake_quantize_dequantize_abs_max",
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
        )

        return quant_out
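
# Illustrative usage sketch (assuming dygraph mode and that the legacy
# fake-quant ops used above are available in the installed Paddle build):
# for 8 bits, range = 2**7 - 1 = 127, so with scale = max(abs(x)) = 0.5 an
# input value of 0.3 maps to round(0.3 / 0.5 * 127) * 0.5 / 127 ~= 0.299.
#
#   import paddle
#   fake_quant = FakeQuantAbsMax(quant_bits=8)
#   x = paddle.uniform([2, 3], min=-1.0, max=1.0)
#   out = fake_quant(x)  # same shape as x, values snapped to the 8-bit grid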


class FakeQuantMovingAverageAbsMax(Layer):
    r"""
    FakeQuantMovingAverageAbsMax layer does the moving_average_abs_max quant and then dequant.
    Its computational formula is described as below:

    :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)`
    :math:`range = 2^{bit\_length - 1} - 1`
    :math:`Out = round(X / scale * range) * scale / range`
    """

    def __init__(
        self,
        name=None,
        moving_rate=0.9,
        quant_bits=8,
        dtype='float32',
        reduce_type=None,
    ):
        super().__init__()
        self._moving_rate = moving_rate
        self._quant_bits = quant_bits
        self._reduce_type = reduce_type
        scale_prefix = (
            "{}.scale".format(name) if name else 'quant_dequant.scale'
        )
        scale_attr = ParamAttr(
            name=unique_name.generate(scale_prefix),
            initializer=Constant(0.001),
            trainable=False,
        )
        self._scale = self.create_parameter(
            shape=[1], attr=scale_attr, dtype=dtype
        )
        self._scale.stop_gradient = True

        state_prefix = (
            "{}.state".format(name) if name else 'quant_dequant.state'
        )
        state_attr = ParamAttr(
            name=unique_name.generate(state_prefix),
            initializer=Constant(1),
            trainable=False,
        )
        self._state = self.create_parameter(
            shape=[1], attr=state_attr, dtype=dtype
        )
        self._state.stop_gradient = True

        accum_prefix = (
            "{}.accum".format(name) if name else 'quant_dequant.accum'
        )
        accum_attr = ParamAttr(
            name=unique_name.generate(accum_prefix),
            initializer=Constant(1),
            trainable=False,
        )
        self._accum = self.create_parameter(
            shape=[1], attr=accum_attr, dtype=dtype
        )
        self._accum.stop_gradient = True

    def forward(self, input):
        if in_dynamic_mode():
            attrs = (
                'moving_rate',
                self._moving_rate,
                'bit_length',
                self._quant_bits,
                'is_test',
                not self.training,
            )
            quant_out = _varbase_creator(
                type=input.type,
                name="{}.quantized.dequantized".format(input.name),
                shape=input.shape,
                dtype=input.dtype,
                persistable=False,
            )
            if self._reduce_type == "max":
                paddle.distributed.all_reduce(
                    self._scale, op=paddle.distributed.ReduceOp.MAX
                )

            state = self._state if self.training else None
            accum = self._accum if self.training else None

            (
                out,
                _,
                _,
                _,
            ) = _legacy_C_ops.fake_quantize_dequantize_moving_average_abs_max(
                input,
                self._scale,
                accum,
                state,
                quant_out,
                self._scale,
                state,
                accum,
                *attrs
            )

            return out

        check_variable_and_dtype(
            input, 'input', ['float32'], "FakeQuantMovingAverageAbsMax"
        )
        attrs = {
            'moving_rate': self._moving_rate,
            'bit_length': self._quant_bits,
            'is_test': not self.training,
        }
        inputs = {"X": [input], "InScale": [self._scale]}
        quant_out = self._helper.create_variable(
            name="{}.quantized.dequantized".format(input.name),
            dtype=input.dtype,
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=False,
        )
        outputs = {"Out": [quant_out], "OutScale": [self._scale]}

        if self.training:
            inputs['InState'] = [self._state]
            inputs['InAccum'] = [self._accum]
            outputs['OutState'] = [self._state]
            outputs['OutAccum'] = [self._accum]

        self._helper.append_op(
            type="fake_quantize_dequantize_moving_average_abs_max",
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
        )

        return quant_out
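
# Illustrative usage sketch (assuming dygraph mode): unlike FakeQuantAbsMax,
# this layer smooths the scale with `state`/`accum` buffers across calls, so
# it is typically used for activations during quantization-aware training.
#
#   import paddle
#   fq_act = FakeQuantMovingAverageAbsMax(moving_rate=0.9, quant_bits=8)
#   for _ in range(3):
#       x = paddle.uniform([4, 16], min=-1.0, max=1.0)
#       out = fq_act(x)    # quant-dequant of x
#   scale = fq_act._scale  # moving-average abs-max scale, shape [1]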


class FakeQuantChannelWiseAbsMax(Layer):
    def __init__(
        self,
        name=None,
        channel_num=None,
        quant_bits=8,
        quant_axis=0,
        dtype='float32',
        quant_on_weight=False,
        reduce_type=None,
    ):
        assert (
            quant_on_weight
        ), "Channel-wise quantization can only be applied to weights."
        super().__init__()
        self._quant_bits = quant_bits
        self._quant_axis = quant_axis
        self._dtype = dtype
        self._name = name
        self._channel_num = channel_num
        self._reduce_type = reduce_type
        scale_prefix = (
            "{}.scale".format(name) if name else 'quant_dequant.scale'
        )
        self._scale_name = unique_name.generate(scale_prefix)
        if quant_on_weight:
            scale_attr = ParamAttr(
                name=self._scale_name,
                initializer=Constant(0.0),
                trainable=False,
            )
            self._scale = self.create_parameter(
                shape=[self._channel_num], attr=scale_attr, dtype=self._dtype
            )
            self._scale.stop_gradient = True
        else:
            self._scale = None

    def forward(self, input):
        if in_dynamic_mode():
            attrs = (
                'bit_length',
                self._quant_bits,
                'quant_axis',
                self._quant_axis,
            )
            quant_out = _varbase_creator(
                type=input.type,
                name="{}.quantized.dequantized".format(input.name),
                shape=input.shape,
                dtype=input.dtype,
                persistable=False,
            )

            out_scale = self._scale
            if self._reduce_type == "max":
                paddle.distributed.all_reduce(
                    out_scale, op=paddle.distributed.ReduceOp.MAX
                )
            if out_scale is None:
                out_scale = _varbase_creator(
                    type=core.VarDesc.VarType.LOD_TENSOR,
                    name=self._scale_name,
                    shape=[self._channel_num],
                    dtype=self._dtype,
                    persistable=False,
                )
                out_scale.stop_gradient = True

            (
                out,
                _,
            ) = _legacy_C_ops.fake_channel_wise_quantize_dequantize_abs_max(
                input, quant_out, out_scale, *attrs
            )
            return out

        check_variable_and_dtype(
            input, 'input', ['float32'], "FakeQuantChannelWiseAbsMax"
        )
        attrs = {'bit_length': self._quant_bits, 'quant_axis': self._quant_axis}
        inputs = {"X": [input]}
        quant_out = self._helper.create_variable(
            name="{}.quantized.dequantized".format(input.name),
            dtype=input.dtype,
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=False,
        )
        out_scale = self._scale
        if not out_scale:
            out_scale = self._helper.create_variable(
                name=self._scale_name,
                dtype=self._dtype,
                type=core.VarDesc.VarType.LOD_TENSOR,
                persistable=False,
                stop_gradient=True,
            )
        outputs = {"Out": [quant_out], "OutScale": [out_scale]}

        self._helper.append_op(
            type="fake_channel_wise_quantize_dequantize_abs_max",
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
        )

        return quant_out
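
# Illustrative usage sketch (assuming dygraph mode): per-channel quant-dequant
# of a conv weight, with one scale per output channel along quant_axis=0.
#
#   import paddle
#   weight = paddle.uniform([8, 4, 3, 3], min=-1.0, max=1.0)
#   fq_weight = FakeQuantChannelWiseAbsMax(
#       channel_num=8, quant_axis=0, quant_on_weight=True
#   )
#   qdq_weight = fq_weight(weight)  # same shape, 8 per-channel scales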


class MovingAverageAbsMaxScale(Layer):
    def __init__(
        self, name=None, moving_rate=0.9, dtype='float32', reduce_type=None
    ):
        r"""
        MovingAverageAbsMaxScale layer is used to calculate the output quantization
        scale of a Layer. Its computational formula is described as below:

        :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)`
        :math:`Out = X`
        """
        super().__init__()
        self._moving_rate = moving_rate
        self._reduce_type = reduce_type
        scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale'
        scale_name = unique_name.generate(scale_prefix)
        scale_attr = ParamAttr(
            name=scale_name, initializer=Constant(0), trainable=False
        )
        self._scale = self.create_parameter(
            shape=[1], attr=scale_attr, dtype=dtype
        )
        self._scale.stop_gradient = True

        state_prefix = "{}.state".format(name) if name else 'outscale.state'
        state_attr = ParamAttr(
            name=unique_name.generate(state_prefix),
            initializer=Constant(0),
            trainable=False,
        )
        self._state = self.create_parameter(
            shape=[1], attr=state_attr, dtype=dtype
        )
        self._state.stop_gradient = True

        accum_prefix = "{}.accum".format(name) if name else 'outscale.accum'
        accum_attr = ParamAttr(
            name=unique_name.generate(accum_prefix),
            initializer=Constant(0),
            trainable=False,
        )
        self._accum = self.create_parameter(
            shape=[1], attr=accum_attr, dtype=dtype
        )
        self._accum.stop_gradient = True

    def forward(self, input):
        if in_dynamic_mode():
            attrs = (
                'moving_rate',
                self._moving_rate,
                'is_test',
                not self.training,
            )

            quant_out = _varbase_creator(
                type=input.type,
                name="{}.tmp".format(input.name),
                shape=input.shape,
                dtype=input.dtype,
                persistable=False,
            )
            if self._reduce_type == "max":
                paddle.distributed.all_reduce(
                    self._scale, op=paddle.distributed.ReduceOp.MAX
                )

            state = self._state if self.training else None
            accum = self._accum if self.training else None

            out, _, _, _ = _legacy_C_ops.moving_average_abs_max_scale(
                input,
                accum,
                state,
                quant_out,
                self._scale,
                state,
                accum,
                *attrs
            )
            return out

        check_variable_and_dtype(
            input, 'input', ['float32', 'float64'], 'MovingAverageAbsMaxScale'
        )

        attrs = {'moving_rate': self._moving_rate, 'is_test': not self.training}
        inputs = {"X": [input]}
        quant_out = self._helper.create_variable(
            name="{}.tmp".format(input.name),
            dtype=input.dtype,
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=False,
        )
        outputs = {"Out": [quant_out], "OutScale": [self._scale]}

        if self.training:
            inputs['InState'] = [self._state]
            inputs['InAccum'] = [self._accum]
            outputs['OutState'] = [self._state]
            outputs['OutAccum'] = [self._accum]

        self._helper.append_op(
            type="moving_average_abs_max_scale",
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
        )

        return quant_out


QuantStub = MovingAverageAbsMaxScale
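
# Illustrative usage sketch (assuming dygraph mode): MovingAverageAbsMaxScale
# (aliased as QuantStub) is an identity in the forward pass and only records a
# moving-average abs-max scale for the tensor flowing through it.
#
#   import paddle
#   stub = QuantStub()
#   x = paddle.uniform([2, 4], min=-1.0, max=1.0)
#   out = stub(x)        # out equals x
#   scale = stub._scale  # updated moving-average abs-max, shape [1]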


class QuantizedConv2D(Layer):
    """
    The computational logic of QuantizedConv2D is the same as that of Conv2D.
    The only difference is that its inputs are all fake quantized.
    """

    def __init__(
        self,
        layer,
        weight_bits=8,
        activation_bits=8,
        moving_rate=0.9,
        weight_quantize_type='abs_max',
        activation_quantize_type='abs_max',
        weight_pre_layer=None,
        act_pre_layer=None,
        weight_quant_layer=None,
        act_quant_layer=None,
    ):
        super().__init__()
        # For Conv2D
        self._groups = getattr(layer, '_groups')
        self._stride = getattr(layer, '_stride')
        self._padding = getattr(layer, '_padding')
        self._padding_mode = getattr(layer, '_padding_mode')
        if self._padding_mode != 'zeros':
            self._reversed_padding_repeated_twice = getattr(
                layer, '_reversed_padding_repeated_twice'
            )
        self._dilation = getattr(layer, '_dilation')
        self._data_format = getattr(layer, '_data_format')
        self.weight = getattr(layer, 'weight')
        self.bias = getattr(layer, 'bias')

        # For FakeQuant
        self._conv2d_quant_axis = 0
        if weight_quant_layer is not None:
            self._fake_quant_weight = weight_quant_layer()
        else:
            self._fake_quant_weight = _get_fake_quant_type(
                weight_quantize_type,
                name=self.weight.name,
                moving_rate=moving_rate,
                quant_bits=weight_bits,
                dtype=self._dtype,
                quant_on_weight=True,
                channel_num=self.weight.shape[self._conv2d_quant_axis],
                quant_axis=self._conv2d_quant_axis,
            )
        if act_quant_layer is not None:
            self._fake_quant_input = act_quant_layer()
        else:
            self._fake_quant_input = _get_fake_quant_type(
                activation_quantize_type,
                name=layer.full_name(),
                moving_rate=moving_rate,
                quant_bits=activation_bits,
                dtype=self._dtype,
573 574
                quant_on_weight=False,
            )
575

576 577 578 579 580 581
        self._act_preprocess = (
            act_pre_layer() if act_pre_layer is not None else None
        )
        self._weight_preprocess = (
            weight_pre_layer() if weight_pre_layer is not None else None
        )

    def forward(self, input):
        if self._act_preprocess is not None:
            input = self._act_preprocess(input)
        quant_input = self._fake_quant_input(input)

        weight = self.weight
        if self._weight_preprocess is not None:
            weight = self._weight_preprocess(self.weight)
        quant_weight = self._fake_quant_weight(weight)

        if self._padding_mode != 'zeros':
            quant_input = F.pad(
                quant_input,
                self._reversed_padding_repeated_twice,
                mode=self._padding_mode,
                data_format=self._data_format,
            )
            self._padding = 0

        return F.conv2d(
            quant_input,
            quant_weight,
            bias=self.bias,
            padding=self._padding,
            stride=self._stride,
            dilation=self._dilation,
            groups=self._groups,
            data_format=self._data_format,
        )
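
# Illustrative usage sketch (assuming dygraph mode): wrapping an existing
# Conv2D for quantization-aware training. The wrapper keeps Conv2D semantics;
# only the input and the weight are fake quantized before F.conv2d runs.
#
#   import paddle
#   conv = paddle.nn.Conv2D(3, 8, 3)
#   q_conv = QuantizedConv2D(conv, weight_quantize_type='channel_wise_abs_max')
#   x = paddle.uniform([1, 3, 32, 32], min=-1.0, max=1.0)
#   y = q_conv(x)  # same shape as conv(x): [1, 8, 30, 30]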


class QuantizedConv2DTranspose(Layer):
    """

    The computational logic of QuantizedConv2DTranspose is the same as that of
    Conv2DTranspose. The only difference is that its inputs are all fake quantized.

    Examples:
       .. code-block:: python

          import paddle
          import paddle.nn as nn
          from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose

          x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
          conv = nn.Conv2DTranspose(4, 6, (3, 3))
          conv_quantized = QuantizedConv2DTranspose(conv)
          y_quantized = conv_quantized(x_var)
          y_var = conv(x_var)
          print(y_var.shape, y_quantized.shape)
          # [2, 6, 10, 10], [2, 6, 10, 10]

    """

    def __init__(
        self,
        layer,
        weight_bits=8,
        activation_bits=8,
        moving_rate=0.9,
        weight_quantize_type='abs_max',
        activation_quantize_type='abs_max',
        weight_pre_layer=None,
        act_pre_layer=None,
        weight_quant_layer=None,
        act_quant_layer=None,
    ):
        r"""
        Constructor.

        The arguments are the same as ImperativeQuantAware.
        """
        super().__init__()
        # For Conv2DTranspose
        self._groups = getattr(layer, '_groups')
        self._stride = getattr(layer, '_stride')
        self._padding = getattr(layer, '_padding')
        self._output_padding = getattr(layer, 'output_padding')
        self._dilation = getattr(layer, '_dilation')
        self._data_format = getattr(layer, '_data_format')
        self.weight = getattr(layer, 'weight')
        self.bias = getattr(layer, 'bias')
        # For FakeQuant
        self._conv2d_transpose_quant_axis = 1
        if weight_quant_layer is not None:
            self._fake_quant_weight = weight_quant_layer()
        else:
            self._fake_quant_weight = _get_fake_quant_type(
                weight_quantize_type,
                name=self.weight.name,
                moving_rate=moving_rate,
                quant_bits=weight_bits,
                dtype=self._dtype,
                quant_on_weight=True,
                channel_num=self.weight.shape[
                    self._conv2d_transpose_quant_axis
                ],
                quant_axis=self._conv2d_transpose_quant_axis,
            )
        if act_quant_layer is not None:
            self._fake_quant_input = act_quant_layer()
        else:
            self._fake_quant_input = _get_fake_quant_type(
                activation_quantize_type,
                name=layer.full_name(),
                moving_rate=moving_rate,
                quant_bits=activation_bits,
                dtype=self._dtype,
                quant_on_weight=False,
            )

        self._act_preprocess = (
            act_pre_layer() if act_pre_layer is not None else None
        )
        self._weight_preprocess = (
            weight_pre_layer() if weight_pre_layer is not None else None
        )

    def forward(self, input, output_size=None):
        if self._act_preprocess is not None:
            input = self._act_preprocess(input)
        quant_input = self._fake_quant_input(input)

        weight = self.weight
        if self._weight_preprocess is not None:
            weight = self._weight_preprocess(self.weight)
        quant_weight = self._fake_quant_weight(weight)

        if output_size is None:
            output_padding = self._output_padding
        else:
            output_padding = 0

        return F.conv2d_transpose(
            quant_input,
            quant_weight,
            bias=self.bias,
            padding=self._padding,
            output_padding=output_padding,
            stride=self._stride,
            dilation=self._dilation,
            groups=self._groups,
            output_size=output_size,
            data_format=self._data_format,
        )


class QuantizedLinear(Layer):
    """
    The computational logic of QuantizedLinear is the same as that of Linear.
    The only difference is that its inputs are all fake quantized.
    """

    def __init__(
        self,
        layer,
        weight_bits=8,
        activation_bits=8,
        moving_rate=0.9,
        weight_quantize_type='abs_max',
        activation_quantize_type='abs_max',
        weight_pre_layer=None,
        act_pre_layer=None,
        weight_quant_layer=None,
        act_quant_layer=None,
    ):
        super().__init__()
        # For Linear
        self.weight = getattr(layer, 'weight')
        self.bias = getattr(layer, 'bias')
        self.name = getattr(layer, 'name')
        # For FakeQuant
        self._linear_quant_axis = 1

        if weight_quant_layer is not None:
            self._fake_quant_weight = weight_quant_layer()
        else:
            self._fake_quant_weight = _get_fake_quant_type(
                weight_quantize_type,
                name=self.weight.name,
                moving_rate=moving_rate,
                quant_bits=weight_bits,
                dtype=self._dtype,
                quant_on_weight=True,
                channel_num=self.weight.shape[self._linear_quant_axis],
                quant_axis=self._linear_quant_axis,
                quant_linear=True,
            )

        if act_quant_layer is not None:
            self._fake_quant_input = act_quant_layer()
        else:
            self._fake_quant_input = _get_fake_quant_type(
                activation_quantize_type,
                name=layer.full_name(),
                moving_rate=moving_rate,
                quant_bits=activation_bits,
                dtype=self._dtype,
781 782
                quant_on_weight=False,
            )

        self._act_preprocess = (
            act_pre_layer() if act_pre_layer is not None else None
        )
        self._weight_preprocess = (
            weight_pre_layer() if weight_pre_layer is not None else None
        )

    def forward(self, input):
        if self._act_preprocess is not None:
            input = self._act_preprocess(input)
        quant_input = self._fake_quant_input(input)

        weight = self.weight
        if self._weight_preprocess is not None:
            weight = self._weight_preprocess(self.weight)
        quant_weight = self._fake_quant_weight(weight)

        out = F.linear(
            x=quant_input, weight=quant_weight, bias=self.bias, name=self.name
        )
        return out
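
# Illustrative usage sketch (assuming dygraph mode): QuantizedLinear mirrors
# the wrapped Linear; its input and weight pass through fake-quant layers
# before F.linear is called.
#
#   import paddle
#   linear = paddle.nn.Linear(16, 8)
#   q_linear = QuantizedLinear(linear)
#   x = paddle.uniform([4, 16], min=-1.0, max=1.0)
#   y = q_linear(x)  # same shape as linear(x): [4, 8]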


class QuantizedColumnParallelLinear(Layer):
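    """
    The computational logic of QuantizedColumnParallelLinear is the same as that
    of ColumnParallelLinear. The only difference is that its inputs are all fake
    quantized.
    """
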
    def __init__(
        self,
        layer,
        weight_bits=8,
        activation_bits=8,
        moving_rate=0.9,
        weight_quantize_type='abs_max',
        activation_quantize_type='abs_max',
        weight_pre_layer=None,
        act_pre_layer=None,
        weight_quant_layer=None,
        act_quant_layer=None,
    ):
        super().__init__()
        assert (
            weight_quant_layer is None
        ), "When quantizing ColumnParallelLinear, weight_quant_layer should be None."
        assert (
            act_quant_layer is None
        ), "When quantizing ColumnParallelLinear, act_quant_layer should be None."

        self.weight = getattr(layer, 'weight')
        self.bias = getattr(layer, 'bias')
        self.name = getattr(layer, '_name')
        # For FakeQuant
        self._linear_quant_axis = 1

        self.is_mp = getattr(layer, 'is_mp')
        self.model_parallel_group = getattr(layer, 'model_parallel_group')
        self.gather_output = getattr(layer, 'gather_output')

        self._fake_quant_weight = _get_fake_quant_type(
            weight_quantize_type,
            name=self.weight.name,
            moving_rate=moving_rate,
            quant_bits=weight_bits,
            dtype=self._dtype,
            quant_on_weight=True,
            channel_num=self.weight.shape[self._linear_quant_axis],
            quant_axis=self._linear_quant_axis,
            reduce_type='max'
            if paddle.distributed.get_world_size() > 1
            else None,
        )

        self._fake_quant_input = _get_fake_quant_type(
            activation_quantize_type,
            name=layer.full_name(),
            moving_rate=moving_rate,
            quant_bits=activation_bits,
            dtype=self._dtype,
            quant_on_weight=False,
863 864
            reduce_type=None,
        )
865

866 867 868 869 870 871
        self._act_preprocess = (
            act_pre_layer() if act_pre_layer is not None else None
        )
        self._weight_preprocess = (
            weight_pre_layer() if weight_pre_layer is not None else None
        )
872 873 874 875

    def forward(self, input):
        if self.is_mp:
            input_parallel = paddle.distributed.collective._c_identity(
                input, group=self.model_parallel_group
            )
        else:
            input_parallel = input

        if self._act_preprocess is not None:
            input_parallel = self._act_preprocess(input_parallel)
        quant_input = self._fake_quant_input(input_parallel)

        weight = self.weight
        if self._weight_preprocess is not None:
            weight = self._weight_preprocess(self.weight)
        quant_weight = self._fake_quant_weight(weight)

        output_parallel = F.linear(
            x=quant_input, weight=quant_weight, bias=self.bias, name=self.name
        )

        if self.gather_output and self.is_mp:
            output = paddle.distributed.collective._c_concat(
                output_parallel, group=self.model_parallel_group
            )
        else:
            output = output_parallel
        return output


class QuantizedRowParallelLinear(Layer):
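    """
    The computational logic of QuantizedRowParallelLinear is the same as that
    of RowParallelLinear. The only difference is that its inputs are all fake
    quantized.
    """
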
    def __init__(
        self,
        layer,
        weight_bits=8,
        activation_bits=8,
        moving_rate=0.9,
        weight_quantize_type='abs_max',
        activation_quantize_type='abs_max',
        weight_pre_layer=None,
        act_pre_layer=None,
        weight_quant_layer=None,
        act_quant_layer=None,
    ):
        super().__init__()
        assert (
            weight_quant_layer is None
        ), "When quantizing RowParallelLinear, weight_quant_layer should be None."
        assert (
            act_quant_layer is None
        ), "When quantizing RowParallelLinear, act_quant_layer should be None."

        # For Linear
        self.weight = getattr(layer, 'weight')
        self.bias = getattr(layer, 'bias')
        self.name = getattr(layer, '_name')
        # For FakeQuant
        self._linear_quant_axis = 1

        self.input_is_parallel = getattr(layer, 'input_is_parallel')
        self.is_mp = getattr(layer, 'is_mp')
        self.model_parallel_group = getattr(layer, 'model_parallel_group')

        self._fake_quant_weight = _get_fake_quant_type(
            weight_quantize_type,
            name=self.weight.name,
            moving_rate=moving_rate,
            quant_bits=weight_bits,
            dtype=self._dtype,
            quant_on_weight=True,
            channel_num=self.weight.shape[self._linear_quant_axis],
            quant_axis=self._linear_quant_axis,
            reduce_type='max'
            if paddle.distributed.get_world_size() > 1
            else None,
        )

        self._fake_quant_input = _get_fake_quant_type(
            activation_quantize_type,
            name=layer.full_name(),
            moving_rate=moving_rate,
            quant_bits=activation_bits,
            dtype=self._dtype,
            quant_on_weight=False,
            reduce_type='max'
            if paddle.distributed.get_world_size() > 1
            else None,
        )

        self._act_preprocess = (
            act_pre_layer() if act_pre_layer is not None else None
        )
        self._weight_preprocess = (
            weight_pre_layer() if weight_pre_layer is not None else None
        )

    def forward(self, input):
        if self.input_is_parallel or (not self.is_mp):
            input_parallel = input
        else:
            # split last dim
            input_parallel = paddle.distributed.collective._c_split(
                input, group=self.model_parallel_group
            )

        if self._act_preprocess is not None:
            input_parallel = self._act_preprocess(input_parallel)
        quant_input = self._fake_quant_input(input_parallel)

        weight = self.weight
        if self._weight_preprocess is not None:
            weight = self._weight_preprocess(self.weight)
        quant_weight = self._fake_quant_weight(weight)

        output_parallel = F.linear(
            x=quant_input, weight=quant_weight, name=self.name
        )
        if self.is_mp:
            output_ = paddle.distributed.collective._mp_allreduce(
                output_parallel,
                group=self.model_parallel_group,
                use_calc_stream=True,
                use_model_parallel=True,
            )
        else:
            output_ = output_parallel
        output = output_ + self.bias if self.bias is not None else output_
        return output


class QuantizedMatmul(Layer):
    """
    The computational logic of QuantizedMatmul is the same as that of paddle.matmul.
    The only difference is that its inputs are all fake quantized.
    """

    def __init__(
        self,
        layer=None,
        weight_bits=8,
        activation_bits=8,
        moving_rate=0.9,
        weight_quantize_type='abs_max',
        activation_quantize_type='abs_max',
        weight_pre_layer=None,
        act_pre_layer=None,
        weight_quant_layer=None,
        act_quant_layer=None,
    ):
        super().__init__()

        # For FakeQuant
        if act_quant_layer is not None:
            self._fake_quant_x = act_quant_layer()
            self._fake_quant_y = act_quant_layer()
        else:
            self._fake_quant_x = _get_fake_quant_type(
                activation_quantize_type,
                moving_rate=moving_rate,
                quant_bits=activation_bits,
                quant_on_weight=False,
            )
            self._fake_quant_y = _get_fake_quant_type(
                activation_quantize_type,
                moving_rate=moving_rate,
                quant_bits=activation_bits,
                quant_on_weight=False,
            )

        self._act_preprocess_x = (
            act_pre_layer() if act_pre_layer is not None else None
        )
        self._act_preprocess_y = (
            act_pre_layer() if act_pre_layer is not None else None
        )

    def forward(self, x, y, transpose_x=False, transpose_y=False, name=None):
        if self._act_preprocess_x is not None:
            x = self._act_preprocess_x(x)
        quant_x = self._fake_quant_x(x)

        if self._act_preprocess_y is not None:
            y = self._act_preprocess_y(y)
        quant_y = self._fake_quant_y(y)

        out = paddle.matmul(quant_x, quant_y, transpose_x, transpose_y, name)
        return out
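
# Illustrative usage sketch (assuming dygraph mode): QuantizedMatmul fake
# quantizes both operands and then calls paddle.matmul.
#
#   import paddle
#   q_matmul = QuantizedMatmul()
#   x = paddle.uniform([2, 3, 4], min=-1.0, max=1.0)
#   y = paddle.uniform([2, 4, 5], min=-1.0, max=1.0)
#   out = q_matmul(x, y)  # same shape as paddle.matmul(x, y): [2, 3, 5]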


class MAOutputScaleLayer(Layer):
    """
    Add a MovingAverageAbsMaxScale layer after the input layer to calculate the
    scale (moving-average abs max) of the input layer's output.
    """

    def __init__(
        self,
        layer=None,
        moving_rate=0.9,
        name=None,
        dtype='float32',
        reduce_type=None,
    ):
        r"""
        Constructor.
        """
        super().__init__()
        self._layer = layer
        if name is None:
            name = layer.full_name()
        self._ma_output_scale = MovingAverageAbsMaxScale(
            name, moving_rate, dtype, reduce_type
        )

    def forward(self, *inputs, **kwargs):
        out = self._layer(*inputs, **kwargs)
        # TODO (jc): support the ops of several outputs
        if (
            isinstance(out, list)
            or isinstance(out, tuple)
            or isinstance(out, dict)
        ):
            return out
        else:
            return self._ma_output_scale(out)
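
# Illustrative usage sketch (assuming dygraph mode): MAOutputScaleLayer wraps a
# layer and records the moving-average abs-max of its (single) output.
#
#   import paddle
#   wrapped = MAOutputScaleLayer(paddle.nn.ReLU())
#   x = paddle.uniform([2, 3], min=-1.0, max=1.0)
#   y = wrapped(x)  # y equals relu(x); the output scale is tracked internally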


class FakeQuantMAOutputScaleLayer(Layer):
    """
    Add a FakeQuantMovingAverageAbsMax layer after the input layer to fake
    quantize its output.
    """

    def __init__(
        self,
        layer,
        weight_bits=8,
        activation_bits=8,
        moving_rate=0.9,
        name=None,
        reduce_type=None,
        *args,
        **kwargs
    ):

        super().__init__()
        self._layer = layer
        self._fake_quant_output = _get_fake_quant_type(
            'moving_average_abs_max',
            name=layer.full_name() if name is None else name,
            moving_rate=moving_rate,
            quant_bits=activation_bits,
            dtype=self._dtype,
            quant_on_weight=False,
            reduce_type=reduce_type,
        )

    def forward(self, *inputs, **kwargs):
        out = self._layer(*inputs, **kwargs)
        # TODO (jc): support the ops of several outputs
        if (isinstance(out, list) or isinstance(out, tuple)) and len(out) > 1:
            return out
        else:
            return self._fake_quant_output(out)


def _get_fake_quant_type(quant_type, **kwargs):
    call_args = {
        "name": kwargs.get("name", None),
        "quant_bits": kwargs.get("quant_bits", 8),
        "dtype": kwargs.get("dtype", "float32"),
        "reduce_type": kwargs.get("reduce_type", None),
    }

    if quant_type == 'abs_max':
        call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
    elif quant_type == 'moving_average_abs_max':
        call_args["moving_rate"] = kwargs.get("moving_rate", 0.9)
    elif quant_type == 'channel_wise_abs_max':
        call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
        call_args["channel_num"] = kwargs.get("channel_num", None)
        call_args["quant_axis"] = kwargs.get("quant_axis", 0)
        assert call_args["channel_num"] is not None, (
            "You need to input channel_num "
            "when you use the channel_wise_abs_max strategy."
        )
    elif quant_type == 'lsq_weight':
        call_args["all_postive"] = kwargs.get("all_postive", False)
        call_args["per_channel"] = False
        call_args["channel_num"] = 1
        call_args["quant_linear"] = kwargs.get("quant_linear", False)
    elif quant_type == 'channel_wise_lsq_weight':
        quant_type = 'lsq_weight'
        call_args["all_postive"] = kwargs.get("all_postive", False)
        call_args["per_channel"] = True
        call_args["channel_num"] = kwargs.get("channel_num", None)
        call_args["quant_linear"] = kwargs.get("quant_linear", False)
        assert call_args["channel_num"] is not None, (
            "You need to input channel_num "
            "when you use the channel_wise_lsq_weight strategy."
        )
    elif quant_type == 'lsq_act':
        call_args["all_postive"] = kwargs.get("all_postive", False)
        call_args["symmetric"] = kwargs.get("symmetric", True)
    fake_quant_map = {
        'abs_max': FakeQuantAbsMax,
        'moving_average_abs_max': FakeQuantMovingAverageAbsMax,
        'channel_wise_abs_max': FakeQuantChannelWiseAbsMax,
        'lsq_weight': FakeQuantWeightLSQPlus,
        'lsq_act': FakeQuantActLSQPlus,
    }

    return fake_quant_map[quant_type](**call_args)
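
# Illustrative usage sketch: obtaining a weight quantizer through the factory
# above (the name below is made up for demonstration only).
#
#   fq_weight = _get_fake_quant_type(
#       'channel_wise_abs_max',
#       name='demo_conv_weight',
#       quant_bits=8,
#       quant_on_weight=True,
#       channel_num=8,
#       quant_axis=0,
#   )
#   # fq_weight is a FakeQuantChannelWiseAbsMax instance with 8 channel scales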