clip.py 28.2 KB
Newer Older
1
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
D
dzhwinter 已提交
2
#
F
fengjiayi 已提交
3 4 5
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
D
dzhwinter 已提交
6
#
D
dzhwinter 已提交
7
#     http://www.apache.org/licenses/LICENSE-2.0
D
dzhwinter 已提交
8
#
F
fengjiayi 已提交
9 10 11 12 13
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
F
update  
fengjiayi 已提交
14

15 16
from __future__ import print_function

F
fengjiayi 已提交
17
import copy
18
import six
19
import warnings
F
fengjiayi 已提交
20

Y
Yu Yang 已提交
21
import functools
22 23
from . import layers
from . import framework
F
fengjiayi 已提交
24
from . import core
C
Chengmo 已提交
25
from . import name_scope
26
from .dygraph import base as imperative_base
Y
Yu Yang 已提交
27

F
fengjiayi 已提交
28
# Public names exported by a star-import of this module.
__all__ = [
    'set_gradient_clip',
    'ErrorClipByValue',
    'ClipGradByValue',
    'ClipGradByNorm',
    'ClipGradByGlobalNorm',
]
Y
Yu Yang 已提交
32 33


F
fengjiayi 已提交
34
class BaseErrorClipAttr(object):
    """Abstract interface of an error-clip attribute.

    Both methods are placeholders that subclasses are expected to override;
    the versions defined here only raise ``NotImplementedError``.
    """

    def __str__(self):
        """Return a printable description of the clip attribute."""
        raise NotImplementedError()

    def _append_clip_op(self, block, grad_name):
        """Append the clipping op for ``grad_name`` to ``block``."""
        raise NotImplementedError()


class ErrorClipByValue(BaseErrorClipAttr):
    r"""
    Clips tensor values to the range [min, max].

    Given a tensor ``t`` (see Examples below), this operation clips its value
    to ``min`` and ``max`` inplace.

    - Any values less than min are set to min.
    - Any values greater than max are set to max.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by
            the user, it is set to ``-max``.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            BATCH_SIZE = 128
            CLIP_MAX = 2e-6
            CLIP_MIN = -1e-6
            prog = fluid.framework.Program()
            with fluid.program_guard(main_program=prog):
                image = fluid.layers.data(
                    name='x', shape=[784], dtype='float32')
                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
                predict = fluid.layers.fc(
                    input=hidden2, size=10, act='softmax')
                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
                cost = fluid.layers.cross_entropy(input=predict, label=label)
                avg_cost = fluid.layers.mean(cost)
            prog_clip = prog.clone()
            prog_clip.block(0).var(hidden1.name)._set_error_clip(
                fluid.clip.ErrorClipByValue(
                    max=CLIP_MAX, min=CLIP_MIN))
    """

    def __init__(self, max, min=None):
        upper = float(max)
        # A missing lower bound mirrors the upper bound around zero.
        lower = -upper if min is None else float(min)
        self.max = upper
        self.min = lower

    def __str__(self):
        bounds = (self.min, self.max)
        return "ByValue, min=%f, max=%f" % bounds

    def _append_clip_op(self, block, grad_name):
        """Append an in-place ``clip`` op on ``grad_name`` to ``block``."""
        desc = block.desc.append_op()
        desc.set_type("clip")
        # Input and output share the same name, so the op overwrites the
        # gradient variable it reads.
        desc.set_input("X", [grad_name])
        desc.set_output("Out", [grad_name])
        desc._set_attr("min", self.min)
        desc._set_attr("max", self.max)
F
fengjiayi 已提交
100 101 102 103 104 105


def error_clip_callback(block, context):
    """Append error-clip ops for the outputs of the last op in ``block``.

    Args:
        block: The block whose most recently appended op is inspected.
        context (dict): A map from gradient variable name to the name of
            the corresponding forward variable.

    Raises:
        TypeError: If a forward variable carries an ``error_clip`` attribute
            that is neither ``None`` nor a ``BaseErrorClipAttr`` instance.
    """
    # the context is a grad_to_var map
    grad_to_var = context
    newest_op_desc = block.desc.op(block.desc.op_size() - 1)
    grad_names = [
        name for name in newest_op_desc.output_arg_names()
        if name in grad_to_var
    ]
    for grad_name in grad_names:
        forward_var = block._var_recursive(grad_to_var[grad_name])
        error_clip = getattr(forward_var, "error_clip", None)
        if error_clip is None:
            # Nothing configured for this variable.
            continue
        if not isinstance(error_clip, BaseErrorClipAttr):
            raise TypeError(
                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
            )
        error_clip._append_clip_op(block, grad_name)
F
fengjiayi 已提交
116 117


118 119 120
class ClipGradBase(object):
    """Base class of the gradient clipping strategies in this module.

    Calling an instance with a list of ``(param, grad)`` pairs dispatches to
    ``_dygraph_clip`` in dygraph mode and to ``_static_clip`` otherwise.
    Every hook defined here raises ``NotImplementedError``.
    """

    def __init__(self):
        super(ClipGradBase, self).__init__()

    def __str__(self):
        raise NotImplementedError()

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        """Clip ``params_grads`` in dygraph mode."""
        raise NotImplementedError

    def _static_clip(self, params_grads):
        """Clip ``params_grads`` in static-graph mode."""
        raise NotImplementedError

    def __call__(self, params_grads):
        """Clip the gradients in ``params_grads`` for the current mode.

        In static-graph mode a single warning is emitted if any parameter
        still carries a ``gradient_clip_attr`` attribute (the attribute
        written by ``set_gradient_clip``).
        """
        if framework.in_dygraph_mode():
            return self._dygraph_clip(params_grads)

        carries_clip_attr = any(
            getattr(param, 'gradient_clip_attr', None) is not None
            for param, _ in params_grads)
        if carries_clip_attr:
            warnings.warn(
                "'set_gradient_clip' will be ineffective, because you have "
                "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
                "is redundant and you can remove it.")
        return self._static_clip(params_grads)

    def _process_context(self, context, param, grad):
        """First-pass hook used by ``append_gradient_clip_ops``."""
        raise NotImplementedError()

    def _create_operators(self, param, grad):
        """Second-pass hook used by ``append_gradient_clip_ops``."""
        raise NotImplementedError()
Y
Yu Yang 已提交
150 151


152
class ClipGradByValue(ClipGradBase):
    """
    Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].

    - Any values less than min are set to ``min``.

    - Any values greater than max are set to ``max``.

    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    Note:
        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by user, it will be set to ``-max``
            automatically. In this case, ``max`` must be greater than 0.

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10,
                                      weight_attr=paddle.ParamAttr(need_clip=True),
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByValue(min=-1, max=1)
            sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sgd.step()
    """

    def __init__(self, max, min=None):
        super(ClipGradByValue, self).__init__()
        if min is None:
            # A symmetric range [-max, max] only makes sense for max > 0.
            assert (max > 0.0)
            min = -max
        self.max = float(max)
        self.min = float(min)

    def __str__(self):
        bounds = (self.min, self.max)
        return "Clip Gradient By Value, min = %f, max=%f" % bounds

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        """Return ``(param, grad)`` pairs with clipped gradients (dygraph).

        Pairs whose gradient is ``None`` are dropped; parameters whose
        ``need_clip`` attribute is ``False`` keep their gradient untouched.
        """
        clipped = []
        for param, grad in params_grads:
            if grad is None:
                continue
            if getattr(param, 'need_clip', True) is not False:
                grad = layers.clip(x=grad, min=self.min, max=self.max)
            clipped.append((param, grad))
        return clipped

    def _static_clip(self, params_grads):
        """Append clip ops to the program and return the new pairs.

        Pairs whose gradient is ``None`` are dropped; parameters whose
        ``need_clip`` attribute is ``False`` keep their gradient untouched.
        """
        clipped = []
        new_grad_names = dict()
        with framework.name_scope('gradient_clip'):
            for param, grad in params_grads:
                if grad is None:
                    continue
                if getattr(param, 'need_clip', True) is not False:
                    with param.block.program._optimized_guard([param, grad]):
                        grad = layers.clip(x=grad, min=self.min, max=self.max)
                    new_grad_names[param.name] = grad.name
                clipped.append((param, grad))
        _correct_clip_op_role_var(clipped, new_grad_names)
        return clipped

    def _process_context(self, context, param, grad):
        # Clipping by value needs no shared state between parameters.
        pass

    def _create_operators(self, param, grad):
        """Return ``param`` together with its clipped gradient."""
        clipped_grad = layers.clip(x=grad, min=self.min, max=self.max)
        return param, clipped_grad


243
class ClipGradByNorm(ClipGradBase):
    r"""
    Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .

    - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.

    - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.

    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::
        Out =
        \left \{
        \begin{aligned}
        & X & & if (norm(X) \leq clip\_norm) \\
        & \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
        \end{aligned}
        \right.


    where :math:`norm(X)` represents the L2 norm of :math:`X`.

    .. math::
        norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}

    Note:
        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        clip_norm(float): The maximum norm value.

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10,
                                      weight_attr=paddle.ParamAttr(need_clip=True),
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sgd.step()
    """

    def __init__(self, clip_norm):
        super(ClipGradByNorm, self).__init__()
        self.clip_norm = float(clip_norm)

    def __str__(self):
        return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        """Return ``(param, grad)`` pairs with norm-clipped gradients (dygraph).

        Pairs whose gradient is ``None`` are dropped; parameters whose
        ``need_clip`` attribute is ``False`` keep their gradient untouched.
        """
        clipped = []
        for param, grad in params_grads:
            if grad is None:
                continue
            if getattr(param, 'need_clip', True) is not False:
                grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm)
            clipped.append((param, grad))
        return clipped

    def _static_clip(self, params_grads):
        """Append ``clip_by_norm`` ops to the program and return the new pairs.

        Pairs whose gradient is ``None`` are dropped; parameters whose
        ``need_clip`` attribute is ``False`` keep their gradient untouched.
        """
        clipped = []
        with framework.name_scope('gradient_clip'):
            new_grad_names = dict()
            for param, grad in params_grads:
                if grad is None:
                    continue
                if getattr(param, 'need_clip', True) is not False:
                    with param.block.program._optimized_guard([param, grad]):
                        grad = layers.clip_by_norm(
                            x=grad, max_norm=self.clip_norm)
                    new_grad_names[param.name] = grad.name
                clipped.append((param, grad))
        _correct_clip_op_role_var(clipped, new_grad_names)
        return clipped

    def _process_context(self, context, param, grad):
        # Per-tensor norm clipping needs no shared state between parameters.
        pass

    def _create_operators(self, param, grad):
        """Return ``param`` together with its norm-clipped gradient."""
        clipped_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm)
        return param, clipped_grad


345
class ClipGradByGlobalNorm(ClipGradBase):
    r"""
    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
    :math:`t\_list` , and limit it to ``clip_norm`` .

    - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.

    - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.

    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::

        t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}

    where:

    .. math::

        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}

    Note:
        ``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        clip_norm (float): The maximum norm value.
        group_name (str, optional): The group name for this clip. Default value is ``default_group``.

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10,
                                      weight_attr=paddle.ParamAttr(need_clip=True),
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sgd.step()
    """

    def __init__(self, clip_norm, group_name="default_group"):
        super(ClipGradByGlobalNorm, self).__init__()
        self.clip_norm = float(clip_norm)
        self.group_name = group_name

    def __str__(self):
        return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        """Scale all clippable gradients by a common factor (dygraph).

        Returns ``params_grads`` unchanged when no gradient takes part in
        the norm; otherwise returns a new list without the ``None``-gradient
        pairs.
        """
        params_and_grads = []
        sum_square_list = []
        # Pass 1: collect the sum of squares of every clippable gradient.
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = layers.square(merge_grad)
            sum_square = layers.reduce_sum(square)
            sum_square_list.append(sum_square)

        # all parameters have been filtered out
        if len(sum_square_list) == 0:
            return params_grads

        global_norm_var = layers.concat(sum_square_list)
        global_norm_var = layers.reduce_sum(global_norm_var)
        global_norm_var = layers.sqrt(global_norm_var)
        max_global_norm = layers.fill_constant(
            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
        # clip_norm / max(global_norm, clip_norm)
        clip_var = layers.elementwise_div(
            x=max_global_norm,
            y=layers.elementwise_max(
                x=global_norm_var, y=max_global_norm))
        # Pass 2: apply the common factor.
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

        return params_and_grads

    def _static_clip(self, params_grads):
        """Append global-norm clipping ops to the program (static graph).

        Clippable gradients are scaled in place by an ``elementwise_mul`` op
        whose output is the gradient variable itself. Returns
        ``params_grads`` unchanged when no gradient takes part in the norm;
        otherwise returns a new list without the ``None``-gradient pairs.
        """
        params_and_grads = []
        sum_square_list = []
        with framework.name_scope('gradient_clip'):
            # Pass 1: collect the sum of squares of every clippable gradient.
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    continue
                merge_grad = g
                with p.block.program._optimized_guard([p, g]):
                    if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                        merge_grad = layers.merge_selected_rows(g)
                        merge_grad = layers.get_tensor_from_selected_rows(
                            merge_grad)

                    square = layers.square(merge_grad)
                    sum_square = layers.reduce_sum(input=square)
                    sum_square_list.append(sum_square)

            # all parameters have been filtered out
            if len(sum_square_list) == 0:
                return params_grads

            # NOTE: ``p`` and ``g`` are whatever the loop above left bound,
            # i.e. the last pair of ``params_grads``.
            with p.block.program._optimized_guard([p, g]):
                global_norm_var = layers.sums(sum_square_list)
                global_norm_var = layers.sqrt(x=global_norm_var)
                max_global_norm = layers.fill_constant(
                    shape=[1],
                    dtype=global_norm_var.dtype,
                    value=self.clip_norm)
                # clip_norm / max(global_norm, clip_norm)
                scale_var = layers.elementwise_div(
                    x=max_global_norm,
                    y=layers.elementwise_max(
                        x=max_global_norm, y=global_norm_var))

            # Pass 2: apply the common factor.
            param_new_grad_name_dict = dict()
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    # In-place scaling: the op writes its result back to g.
                    p.block.append_op(
                        type='elementwise_mul',
                        inputs={'X': g,
                                'Y': scale_var},
                        outputs={'Out': g})

                # The clipped gradient lives in ``g``; pairing the parameter
                # with itself would hand the parameter back as its own
                # gradient.
                param_new_grad_name_dict[p.name] = g.name
                params_and_grads.append((p, g))

        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        """Record the squared norm of ``grad`` in the shared ``context``.

        The first parameter of a group creates the group's entries; later
        ones must use the same ``clip_norm``.

        Raises:
            ValueError: If ``clip_norm`` differs from the value already
                stored for this group.
        """
        if self.group_name not in context:
            context[self.group_name] = []
            context[self.group_name + "_clip_value"] = self.clip_norm
            context[self.group_name + "_clip"] = layers.fill_constant(
                shape=[1], dtype=grad.dtype, value=self.clip_norm)
        else:
            if not self.clip_norm == context[self.group_name + "_clip_value"]:
                raise ValueError(
                    "All parameters' 'clip_norm' of a same group should be the same"
                )

        merge_grad = grad
        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = layers.merge_selected_rows(grad)
            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)

        square = layers.square(merge_grad)
        local_norm_var = layers.reduce_sum(input=square)
        context[self.group_name].append(local_norm_var)

        # Kept so that _create_operators can read the group's entries.
        self.context = context

    def _create_operators(self, param, grad):
        """Return ``param`` and ``grad`` scaled by the group's factor.

        The factor is built on first use and cached in the context under
        ``<group_name>_scale``.
        """
        group_scale_name = self.group_name + "_scale"
        if group_scale_name not in self.context:
            group_norm_var = layers.sums(input=self.context[self.group_name])
            group_norm_var = layers.sqrt(x=group_norm_var)
            clip_var = self.context[self.group_name + "_clip"]
            # clip_norm / max(clip_norm, group_norm)
            group_scale_var = layers.elementwise_div(
                x=clip_var,
                y=layers.elementwise_max(
                    x=clip_var, y=group_norm_var))
            assert group_scale_var.shape == (1, )
            self.context[group_scale_name] = group_scale_var

        new_grad = layers.elementwise_mul(
            x=grad, y=self.context[group_scale_name])

        return param, new_grad
F
fengjiayi 已提交
544 545


546
@framework.dygraph_not_support
def set_gradient_clip(clip, param_list=None, program=None):
    """
    :api_attr: Static Graph

    Warning:

        This API must be used after building network, and before ``minimize`` ,
        and it may be removed in future releases, so it is not recommended.
        It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
        this is a better method to clip gradient. There are three clipping strategies:
         :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
         :ref:`api_fluid_clip_GradientClipByValue` .

    To specify parameters that require gradient clip.

    Args:
        clip (GradientClipBase): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ).
        param_list (list(Variable), optional): Parameters that require gradient clip.
                It can be a list of parameter or a list of parameter's name.
                Default None, meaning that all parameters in the program will be included.
        program (Program, optional): The program where parameters are located.
                Default None, meaning that using :ref:`api_fluid_default_main_program` .

    Returns:
        None

    Raises:
        TypeError: If ``clip`` is not an instance of a ``ClipGradBase``
            subclass, or if ``param_list`` does not resolve to a list of
            parameters.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid

            def network():
                image = fluid.data(name='image', shape=[
                                   None, 28], dtype='float32')
                param_attr1 = fluid.ParamAttr("fc1_param")
                fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1)
                param_attr2 = fluid.ParamAttr("fc2_param")
                fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2)
                loss = fluid.layers.reduce_mean(fc2)
                return loss


            # network 1: clip all parameter gradient
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 2: clip parameter gradient by name
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
                    param_list=["fc1_param", "fc2_param"])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 3: clip parameter gradient by value
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                param_var1 = fluid.default_main_program().global_block().var("fc1_param")
                param_var2 = fluid.default_main_program().global_block().var("fc2_param")
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
                    param_list=[param_var1, param_var2])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                clip1 = fluid.clip.GradientClipByValue(min=-1.0, max=1.0)
                clip2 = fluid.clip.GradientClipByNorm(clip_norm=1.0)
                # Set the gradient clipping strategy: clip1
                fluid.clip.set_gradient_clip(clip1)
                # Set the gradient clipping strategy: clip2
                sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
                sgd.minimize(loss)
                # 'set_gradient_clip' will not take effect when setting has a conflict,
                # and the gradient clipping strategy will be 'clip2'

    """
    warnings.warn("Caution! 'set_gradient_clip' is not recommended "
                  "and may be deprecated in future! "
                  "We recommend a new strategy: set 'grad_clip' "
                  "when initializing the 'optimizer'. "
                  "This method can reduce the mistakes, please "
                  "refer to documention of 'optimizer'.")

    if not isinstance(clip, ClipGradBase):
        raise TypeError(
            "'clip' should be an instance of ClipGradBase's derived class")

    if program is None:
        program = framework.default_main_program()
    root_block = program.block(0)

    # An op created under an "optimizer" name scope means 'minimize' has
    # already been called on this program.
    minimize_already_called = any(
        'op_namescope' in op.all_attrs() and
        "optimizer" in op.attr("op_namescope") for op in root_block.ops)
    if minimize_already_called:
        warnings.warn(
            "'minimize' has been invoked before, this will make 'set_gradient_clip' "
            "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'."
        )

    targets = param_list
    if targets is None:
        targets = root_block.all_parameters()
    # A list made only of names is resolved to the variables it refers to.
    if all(isinstance(item, six.string_types) for item in targets):
        targets = [root_block.var(item) for item in targets]
    if any(not isinstance(item, framework.Parameter) for item in targets):
        raise TypeError(
            "'param_list' should be a list of Parameter or basestring(parameter's name)."
        )

    # Every parameter receives its own copy of the strategy.
    for parameter in targets:
        parameter.gradient_clip_attr = copy.deepcopy(clip)
F
fengjiayi 已提交
669 670


671
def append_gradient_clip_ops(param_grads):
    """Append clipping ops for parameters configured via ``gradient_clip_attr``.

    Works in two passes over the pairs whose gradient is not ``None``: the
    first calls ``_process_context`` on each parameter's clip attribute, the
    second calls ``_create_operators`` on that same attribute.

    Args:
        param_grads (list): ``(param, grad)`` pairs.

    Returns:
        list: ``[param, new_grad]`` pairs for every pair whose gradient is
        not ``None``. ``param_grads`` itself is returned unchanged as soon
        as a parameter with a gradient has no ``gradient_clip_attr``.

    Raises:
        TypeError: If a ``gradient_clip_attr`` is not a ``ClipGradBase``
            instance.
    """
    context = dict()
    # (param, grad, clip_attr) triples, so that the second pass uses each
    # parameter's own clip attribute instead of the last one seen.
    pending = []
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard(
            [p, g]), framework.name_scope('gradient_clip'):
            clip_attr = getattr(p, 'gradient_clip_attr', None)
            if clip_attr is None:
                return param_grads
            if not isinstance(clip_attr, ClipGradBase):
                raise TypeError(
                    "clip attribute should be an instance of GradientClipBase")

            clip_attr._process_context(context=context, param=p, grad=g)
        pending.append((p, g, clip_attr))

    res = []
    param_new_grad_name_dict = dict()
    for p, g, clip_attr in pending:
        with p.block.program._optimized_guard(
            [p, g]), framework.name_scope('gradient_clip'):
            param, new_grad = clip_attr._create_operators(param=p, grad=g)
            param_new_grad_name_dict[param.name] = new_grad.name
            res.append([param, new_grad])

    _correct_clip_op_role_var(res, param_new_grad_name_dict)
    return res


def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
    """Fix the param/grad mapping stored on gradient-clip ops.

    For every op in the global block whose ``op_namescope`` contains
    ``gradient_clip`` and whose ``op_role_var`` is non-empty, the attribute
    is rewritten to ``[param_name, new_grad_name]`` when the parameter name
    is a key of ``param_new_grad_name_dict``.

    Note: This function is sensitive to the time cost of the network with
    gradient clipping and should not be changed easily. If you must change
    it, please test the time cost.

    Args:
        params_grads (list): ``(param, grad)`` pairs; pairs whose gradient
            is ``None`` are ignored.
        param_new_grad_name_dict (dict): Parameter name -> name of its
            clipped gradient.
    """
    if len(param_new_grad_name_dict) == 0:
        return

    visited_block_ids = set()
    for param, grad in params_grads:
        if grad is None:
            continue
        block_idx = param.block.idx
        # The ops are scanned once per distinct block index.
        if block_idx in visited_block_ids:
            continue
        visited_block_ids.add(block_idx)

        for op in param.block.program.global_block().ops:
            if 'op_namescope' not in op.all_attrs():
                continue
            if "gradient_clip" not in op.attr("op_namescope"):
                continue
            role_var = op.attr('op_role_var')
            if not role_var:
                continue
            param_name = role_var[0]
            if param_name in param_new_grad_name_dict:
                op._set_attr(
                    'op_role_var',
                    [param_name, param_new_grad_name_dict[param_name]])
Y
Yu Yang 已提交
725 726


727 728 729 730
# ``GradientClip*`` names are aliases of the ``ClipGrad*`` classes above.
GradientClipBase = ClipGradBase
GradientClipByValue = ClipGradByValue
GradientClipByNorm = ClipGradByNorm
GradientClipByGlobalNorm = ClipGradByGlobalNorm