#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import warnings

import functools
import paddle
from . import layers
from . import framework
from . import core
from . import name_scope
from .dygraph import base as imperative_base
from .data_feeder import check_variable_and_dtype
from .framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph
from .layer_helper import LayerHelper
from .framework import default_main_program
from paddle import _C_ops, _legacy_C_ops

__all__ = [
    'set_gradient_clip',
    'ErrorClipByValue',
    'ClipGradByValue',
    'ClipGradByNorm',
    'ClipGradByGlobalNorm',
]

_clip_by_global_norm_using_mp_type_flag = False


def _clip_by_global_norm_using_mp_type(*args):
    global _clip_by_global_norm_using_mp_type_flag
    assert len(args) <= 1
    if len(args) == 1:
        assert isinstance(args[0], bool)
        old_value = _clip_by_global_norm_using_mp_type_flag
        _clip_by_global_norm_using_mp_type_flag = args[0]
        return old_value
    else:
        return _clip_by_global_norm_using_mp_type_flag


def _cast_to_mp_type_if_enabled(x):
    if (
        x.dtype == core.VarDesc.VarType.FP16
        or x.dtype == core.VarDesc.VarType.BF16
    ) and _clip_by_global_norm_using_mp_type():
        return x.astype(core.VarDesc.VarType.FP32)
    else:
        return x

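# Illustrative note (not part of the original module): the two helpers above form a
# simple module-level switch. A hedged usage sketch, assuming a caller that wants
# fp32 accumulation for fp16/bf16 gradients during global-norm clipping:
#
#     previous = _clip_by_global_norm_using_mp_type(True)   # enable, remember old value
#     # ... run the clipping logic; _cast_to_mp_type_if_enabled(x) now returns an
#     # ... fp32 copy for fp16/bf16 inputs
#     _clip_by_global_norm_using_mp_type(previous)          # restore the previous setting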

def _squared_l2_norm(x):
    r"""
    This OP returns the squared L2 norm of a tensor.
    """

    x = _cast_to_mp_type_if_enabled(x)
    if (
        core.is_compiled_with_xpu()
        or x.dtype == core.VarDesc.VarType.FP16
        or x.dtype == core.VarDesc.VarType.BF16
    ):
        square = paddle.square(x)
        sum_square = paddle.sum(square)
        return sum_square

    if in_dygraph_mode():
        return _C_ops.squared_l2_norm(x)
    elif _in_legacy_dygraph():
        return _legacy_C_ops.squared_l2_norm(x)

    op_type = 'squared_l2_norm'
    check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type)
    helper = LayerHelper(op_type, **locals())
    out = helper.create_variable_for_type_inference(x.dtype)

    inputs = {"X": x}
    outputs = {'Out': out}
    helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
    return out


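# Hedged sketch (dygraph, for intuition only): _squared_l2_norm(t) is numerically
# equivalent to paddle.sum(paddle.square(t)); the dedicated squared_l2_norm op just
# avoids materializing the intermediate squared tensor.
#
#     import paddle
#     t = paddle.to_tensor([3.0, 4.0])
#     # paddle.sum(paddle.square(t)) -> 25.0, matching _squared_l2_norm(t)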
class BaseErrorClipAttr:
    def __str__(self):
        raise NotImplementedError()

    def _append_clip_op(self, block, grad_name):
        raise NotImplementedError()


class ErrorClipByValue(BaseErrorClipAttr):
    r"""
    Clips tensor values to the range [min, max].

    Given a tensor ``t`` (see Examples below), this operation clips its values
    to the range [``min``, ``max``] in place.

    - Any values less than min are set to min.
    - Any values greater than max are set to max.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by user,
            it will be set to ``-max`` by the framework.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import paddle
            paddle.enable_static()
            BATCH_SIZE = 128
            CLIP_MAX = 2e-6
            CLIP_MIN = -1e-6
            prog = fluid.framework.Program()
            with fluid.program_guard(main_program=prog):
                image = fluid.layers.data(
                    name='x', shape=[784], dtype='float32')
                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
                predict = fluid.layers.fc(
                    input=hidden2, size=10, act='softmax')
                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
                cost = paddle.nn.functional.cross_entropy(input=predict, label=label, reduction='none', use_softmax=False)
                avg_cost = paddle.mean(cost)
            prog_clip = prog.clone()
            prog_clip.block(0).var(hidden1.name)._set_error_clip(
                fluid.clip.ErrorClipByValue(
                    max=CLIP_MAX, min=CLIP_MIN
                )
            )
    """

    def __init__(self, max, min=None):
        max = float(max)
        if min is None:
            min = -max
        else:
            min = float(min)
        self.max = max
        self.min = min

    def __str__(self):
        return "ByValue, min=%f, max=%f" % (self.min, self.max)

    def _append_clip_op(self, block, grad_name):
        clip_op_desc = block.desc.append_op()
        clip_op_desc.set_type("clip")
        clip_op_desc.set_input("X", [grad_name])
        clip_op_desc.set_output("Out", [grad_name])
        clip_op_desc._set_attr("min", self.min)
        clip_op_desc._set_attr("max", self.max)


def error_clip_callback(block, context):
    # the context is a grad_to_var map
    grad_to_var = context
    op_desc = block.desc.op(block.desc.op_size() - 1)
    for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
        fwd_var = block._var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
        if not (
            error_clip is None or isinstance(error_clip, BaseErrorClipAttr)
        ):
            raise TypeError(
                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
            )
        if error_clip is not None:
            error_clip._append_clip_op(block, grad_n)

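# Hedged note (assumed call site, for orientation only): error_clip_callback is meant
# to be registered as a callback while the backward pass is being built, so that each
# newly appended backward op applies the error clip of its forward variable, e.g.
#
#     # paddle.fluid.backward.append_backward(loss, callbacks=[error_clip_callback])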

class ClipGradBase:
    def __init__(self):
        super().__init__()

    def __str__(self):
        raise NotImplementedError()

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        raise NotImplementedError

    def _static_clip(self, params_grads):
        raise NotImplementedError

    def __call__(self, params_grads):
        if framework._non_static_mode():
            return self._dygraph_clip(params_grads)
        else:
            for p, g in params_grads:
                if getattr(p, 'gradient_clip_attr', None) is not None:
                    warnings.warn(
                        "'set_gradient_clip' will be ineffective, because you have "
                        "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
                        "is redundant and you can remove it."
                    )
                    break
            return self._static_clip(params_grads)

    def _process_context(self, context, param, grad):
        raise NotImplementedError()

    def _create_operators(self, param, grad):
        raise NotImplementedError()


class ClipGradByValue(ClipGradBase):
    """
    Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].

    - Any values less than min are set to ``min``.

    - Any values greater than max are set to ``max``.

    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    Note:
        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by user, it will be set to ``-max``
            automatically. In this case, ``max`` must be greater than 0.

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10,
                                      weight_attr=paddle.ParamAttr(need_clip=True),
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByValue(min=-1, max=1)
            sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sgd.step()
    """

    def __init__(self, max, min=None):
        super().__init__()
        if min is None:
            assert max > 0.0
            min = -max
        self.max = float(max)
        self.min = float(min)

    def __str__(self):
        return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max)

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = paddle.clip(x=g, min=self.min, max=self.max)
            params_and_grads.append((p, new_grad))
        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        param_new_grad_name_dict = dict()
        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_grad = layers.clip(x=g, min=self.min, max=self.max)
                params_and_grads.append((p, new_grad))
                param_new_grad_name_dict[p.name] = new_grad.name
        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = layers.clip(x=grad, min=self.min, max=self.max)
        return param, new_grad


class ClipGradByNorm(ClipGradBase):
    r"""
    Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .

    - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.

    - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.

    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::
        Out =
        \left\{
            \begin{array}{ccl}
                X & & if (norm(X) \leq clip\_norm) \\
                \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
        \end{array}
        \right.


    where :math:`norm(X)` represents the L2 norm of :math:`X`.

    .. math::
        norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}

    Note:
        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        clip_norm(float): The maximum norm value.

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10,
                                      weight_attr=paddle.ParamAttr(need_clip=True),
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sgd.step()
    """

    def __init__(self, clip_norm):
        super().__init__()
        self.clip_norm = float(clip_norm)

    def __str__(self):
        return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
            params_and_grads.append((p, new_grad))
        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        with framework.name_scope('gradient_clip'):
            param_new_grad_name_dict = dict()
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
                param_new_grad_name_dict[p.name] = new_grad.name
                params_and_grads.append((p, new_grad))
        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm)
        return param, new_grad


_allow_pure_fp16_global_norm_clip_flag = False


def _allow_pure_fp16_global_norm_clip(*args):
    global _allow_pure_fp16_global_norm_clip_flag
    if len(args) == 0:
        return _allow_pure_fp16_global_norm_clip_flag
    else:
        assert len(args) == 1 and isinstance(args[0], bool)
        old_value = _allow_pure_fp16_global_norm_clip_flag
        _allow_pure_fp16_global_norm_clip_flag = args[0]
        return old_value

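# Hedged note (illustrative, not from the original file): by default the fp16 partial
# sums computed in ClipGradByGlobalNorm._static_clip are cast up before being added
# into the global norm; enabling this flag keeps a pure-fp16 program in fp16, e.g.
#
#     old = _allow_pure_fp16_global_norm_clip(True)
#     # ... build the static-graph clipping program ...
#     _allow_pure_fp16_global_norm_clip(old)   # restore the previous value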

class ClipGradByGlobalNorm(ClipGradBase):
    r"""
    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
    :math:`t\_list` , and limit it to ``clip_norm`` .

    - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.

    - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.

    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::

        t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}

    where:

    .. math::

        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}

    Note:
        ``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        clip_norm (float): The maximum norm value.
        group_name (str, optional): The group name for this clip. Default value is ``default_group``.

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10,
                                      weight_attr=paddle.ParamAttr(need_clip=True),
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sgd.step()
    """

    def __init__(
        self, clip_norm, group_name="default_group", auto_skip_clip=False
    ):
        super().__init__()
        self.clip_norm = float(clip_norm)
        self.group_name = group_name
        assert isinstance(auto_skip_clip, bool)
        self.auto_skip_clip = auto_skip_clip

    def __str__(self):
        return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        sum_square_list_fp16 = []
        sum_square_list_fp32 = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g

            if in_dygraph_mode() and g.is_selected_rows():
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = merge_grad._get_tensor_from_selected_rows()

            elif g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)

            sum_square = _squared_l2_norm(merge_grad)
            if (
                sum_square.dtype == core.VarDesc.VarType.FP16
                or sum_square.dtype == core.VarDesc.VarType.BF16
            ):
                sum_square_list_fp16.append(sum_square)
            elif sum_square.dtype == core.VarDesc.VarType.FP32:
                sum_square_list_fp32.append(sum_square)
            else:
                sum_square_list.append(sum_square)

        # all parameters have been filtered out
        if (
            len(sum_square_list)
            + len(sum_square_list_fp16)
            + len(sum_square_list_fp32)
            == 0
        ):
            return params_grads

        sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
        global_norm_var = []
        if len(sum_square_list_fp16) > 0:
            global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16)
            global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
        if len(sum_square_list_fp32) > 0:
            global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32)
            if sum_dtype == 'float32':
                global_norm_var.append(global_norm_var_fp32)
            else:
                global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
        if len(sum_square_list) > 0:
            global_norm_var_fp64 = paddle.add_n(sum_square_list)
            global_norm_var.append(global_norm_var_fp64)
        global_norm_var = paddle.add_n(global_norm_var)
        global_norm_var = paddle.sqrt(global_norm_var)
        max_global_norm = layers.fill_constant(
            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
        )

        need_clip = False
        if not self.auto_skip_clip:  # always apply clip
            need_clip = True
            clip_var = paddle.divide(
                x=max_global_norm,
                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
            )
        elif global_norm_var > max_global_norm:
            # gradients need clipping only when global_norm_var > max_global_norm
            need_clip = True
            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)

        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            # TODO(wangxi): use inplace elementwise_mul
            if need_clip:
                clip_input = (
                    clip_var.astype(g.dtype)
                    if clip_var.dtype != g.dtype
                    else clip_var
                )
                new_grad = paddle.multiply(g, clip_input)
                params_and_grads.append((p, new_grad))
            else:
                params_and_grads.append((p, g))

        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        sum_square_list_fp16 = []
        sum_square_list_fp32 = []
        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    continue
                merge_grad = g
                with p.block.program._optimized_guard([p, g]):
                    if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                        merge_grad = layers.merge_selected_rows(g)
                        merge_grad = layers.get_tensor_from_selected_rows(
                            merge_grad
                        )
                    sum_square = _squared_l2_norm(merge_grad)
                    if sum_square.dtype == core.VarDesc.VarType.FP16:
                        sum_square_list_fp16.append(sum_square)
                    elif sum_square.dtype == core.VarDesc.VarType.FP32:
                        sum_square_list_fp32.append(sum_square)
                    else:
                        sum_square_list.append(sum_square)

            # all parameters have been filtered out
            if (
                len(sum_square_list)
                + len(sum_square_list_fp16)
                + len(sum_square_list_fp32)
                == 0
            ):
                return params_grads

            with p.block.program._optimized_guard([p, g]):
                sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"

                global_norm_var = []
                if len(sum_square_list_fp16) > 0:
                    global_norm_var_fp16 = layers.sums(sum_square_list_fp16)
                    if (
                        sum_square_list_fp32
                        or sum_square_list
                        or not _allow_pure_fp16_global_norm_clip()
                    ):
                        global_norm_var.append(
                            global_norm_var_fp16.astype(sum_dtype)
                        )
                    else:
                        global_norm_var.append(global_norm_var_fp16)
                if len(sum_square_list_fp32) > 0:
                    global_norm_var_fp32 = layers.sums(sum_square_list_fp32)
                    if sum_dtype == 'float32':
                        global_norm_var.append(global_norm_var_fp32)
                    else:
                        global_norm_var.append(
                            global_norm_var_fp32.astype(sum_dtype)
                        )
                if len(sum_square_list) > 0:
                    # fp64
                    global_norm_var_other_dtype = layers.sums(sum_square_list)
                    global_norm_var.append(global_norm_var_other_dtype)

                global_norm_var = (
                    layers.sums(global_norm_var)
                    if len(global_norm_var) > 1
                    else global_norm_var[0]
                )
                global_norm_var = paddle.sqrt(x=global_norm_var)
                max_global_norm = layers.fill_constant(
                    shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
                )
                scale_var = paddle.divide(
                    x=max_global_norm,
                    y=paddle.maximum(x=max_global_norm, y=global_norm_var),
                )
            param_new_grad_name_dict = dict()
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_g = _cast_to_mp_type_if_enabled(g)
                    # inplace
                    scale_input = (
                        scale_var.astype('float16')
                        if new_g.dtype == core.VarDesc.VarType.FP16
                        and scale_var.dtype != core.VarDesc.VarType.FP16
                        else scale_var
                    )
                    # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g
                    # will be in different blocks with the gradient clip related ops.
                    # We need to handle the correct block, otherwise will encounter
                    # a 'NotFoundError' during compile time.
                    block = default_main_program().current_block()
                    block.append_op(
                        type='elementwise_mul',
                        inputs={'X': new_g, 'Y': scale_input},
                        outputs={'Out': new_g},
                    )
                    if new_g is not g:
                        block.append_op(
                            type='cast',
                            inputs={'X': new_g},
                            outputs={'Out': g},
                            attrs={
                                'in_dtype': new_g.dtype,
                                'out_dtype': g.dtype,
                            },
                        )

                param_new_grad_name_dict[p.name] = g.name
                params_and_grads.append((p, g))

        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        if self.group_name not in context:
            context[self.group_name] = []
            context[self.group_name + "_clip_value"] = self.clip_norm
            context[self.group_name + "_clip"] = layers.fill_constant(
                shape=[1], dtype=grad.dtype, value=self.clip_norm
            )
        else:
            if not self.clip_norm == context[self.group_name + "_clip_value"]:
                raise ValueError(
                    "All parameters' 'clip_norm' of a same group should be the same"
                )

        merge_grad = grad
        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = layers.merge_selected_rows(grad)
            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)

        local_norm_var = _squared_l2_norm(merge_grad)
        context[self.group_name].append(local_norm_var)

        self.context = context

    def _create_operators(self, param, grad):
        group_scale_name = self.group_name + "_scale"
        if group_scale_name not in self.context:
            group_norm_var = layers.sums(input=self.context[self.group_name])
            group_norm_var = paddle.sqrt(x=group_norm_var)
            clip_var = self.context[self.group_name + "_clip"]
            group_scale_var = paddle.divide(
                x=clip_var,
                y=paddle.maximum(x=clip_var, y=group_norm_var),
            )
            assert group_scale_var.shape == (1,)
            self.context[group_scale_name] = group_scale_var

        # inplace
        param.block.append_op(
            type='elementwise_mul',
            inputs={'X': grad, 'Y': self.context[group_scale_name]},
            outputs={'Out': grad},
        )

        return param, grad


@framework.dygraph_not_support
def set_gradient_clip(clip, param_list=None, program=None):
    """
    :api_attr: Static Graph

    Warning:

        This API must be used after building the network, and before ``minimize`` ,
        and it may be removed in future releases, so it is not recommended.
        It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
        which is a better way to clip gradients. There are three clipping strategies:
         :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
         :ref:`api_fluid_clip_GradientClipByValue` .

    To specify parameters that require gradient clipping.

    Args:
        clip (GradientClipBase): Gradient clipping strategy; it must be an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ).
        param_list (list(Variable), optional): Parameters that require gradient clipping.
                It can be a list of Parameters or a list of parameter names.
                Default None, meaning that all parameters in the program will be included.
        program (Program, optional): The program where parameters are located.
                Default None, meaning that using :ref:`api_fluid_default_main_program` .

    Returns:
        None

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid

            def network():
                image = fluid.data(name='image', shape=[
                                   None, 28], dtype='float32')
                param_attr1 = fluid.ParamAttr("fc1_param")
                fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1)
                param_attr2 = fluid.ParamAttr("fc2_param")
                fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2)
                loss = fluid.layers.reduce_mean(fc2)
                return loss


            # network 1: clip all parameter gradient
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 2: clip parameter gradient by name
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
                    param_list=["fc1_param", "fc2_param"])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 3: clip parameter gradient by parameter variables
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                param_var1 = fluid.default_main_program().global_block().var("fc1_param")
                param_var2 = fluid.default_main_program().global_block().var("fc2_param")
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
                    param_list=[param_var1, param_var2])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                clip1 = fluid.clip.GradientClipByValue(min=-1.0, max=1.0)
                clip2 = fluid.clip.GradientClipByNorm(clip_norm=1.0)
                # Set the gradient clipping strategy: clip1
                fluid.clip.set_gradient_clip(clip1)
                # Set the gradient clipping strategy: clip2
                sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
                sgd.minimize(loss)
                # When the two settings conflict, 'set_gradient_clip' will not take effect,
                # and the gradient clipping strategy will be 'clip2'


    """
    warnings.warn(
        "Caution! 'set_gradient_clip' is not recommended "
        "and may be deprecated in future! "
        "We recommend a new strategy: set 'grad_clip' "
        "when initializing the 'optimizer'. "
        "This method can reduce the mistakes, please "
        "refer to documentation of 'optimizer'."
    )

    if not isinstance(clip, ClipGradBase):
        raise TypeError(
            "'clip' should be an instance of ClipGradBase's derived class"
        )
    if program is None:
        program = framework.default_main_program()

    for op in program.block(0).ops:
        if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr(
            "op_namescope"
        ):
            warnings.warn(
                "'minimize' has been invoked before, this will make 'set_gradient_clip' "
                "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'."
            )
            break

    if param_list is None:
        param_list = program.block(0).all_parameters()
    if all(isinstance(elem, str) for elem in param_list):
        param_list = [program.block(0).var(elem) for elem in param_list]
    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
        raise TypeError(
            "'param_list' should be a list of Parameter or basestring(parameter's name)."
        )

    for param in param_list:
        param.gradient_clip_attr = copy.deepcopy(clip)


def append_gradient_clip_ops(param_grads):
    context = dict()
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard([p, g]), framework.name_scope(
            'gradient_clip'
        ):
            clip_attr = getattr(p, 'gradient_clip_attr', None)
            if clip_attr is None:
                return param_grads
            if not isinstance(clip_attr, ClipGradBase):
                raise TypeError(
                    "clip attribute should be an instance of GradientClipBase"
                )

            clip_attr._process_context(context=context, param=p, grad=g)

    res = []
    param_new_grad_name_dict = dict()
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard([p, g]), framework.name_scope(
            'gradient_clip'
        ):
            param, new_grad = clip_attr._create_operators(param=p, grad=g)
            param_new_grad_name_dict[param.name] = new_grad.name
            res.append([param, new_grad])

    _correct_clip_op_role_var(res, param_new_grad_name_dict)
    return res


# correct the wrong mapping between param & grad in clip ops
# Note: This function is sensitive to the time cost of the network with gradient clipping
# and should not be changed easily. If you must change, please test the time cost.
def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
    block_id_list = []
    if len(param_new_grad_name_dict) == 0:
        return
    for param, grad in params_grads:
        if grad is None:
            continue
        block_id = param.block.idx
        if block_id in block_id_list:
            continue
        block_id_list.append(block_id)
        for op in param.block.program.global_block().ops:
            if (
                op.has_attr("op_namescope")
                and "gradient_clip" in op.attr("op_namescope")
                and op.attr('op_role_var')
            ):
                param_name = op.attr('op_role_var')[0]
                if param_name in param_new_grad_name_dict:
                    correct_p_g = [
                        param_name,
                        param_new_grad_name_dict[param_name],
                    ]
                    op._set_attr('op_role_var', correct_p_g)


GradientClipBase = ClipGradBase
GradientClipByValue = ClipGradByValue
GradientClipByNorm = ClipGradByNorm
GradientClipByGlobalNorm = ClipGradByGlobalNorm
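# Hedged usage note: the assignments above keep the legacy fluid-era names importable,
# so older code such as the following continues to work (illustrative only):
#
#     from paddle.fluid.clip import GradientClipByGlobalNorm
#     clip = GradientClipByGlobalNorm(clip_norm=1.0)   # same class as ClipGradByGlobalNorm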