#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import copy
import six

import functools
from . import layers
from . import framework
from . import core
from . import name_scope

__all__ = [
    'set_gradient_clip',
    'ErrorClipByValue',
    'GradientClipByValue',
    'GradientClipByNorm',
    'GradientClipByGlobalNorm',
]


class BaseErrorClipAttr(object):
    def __str__(self):
        raise NotImplementedError()

    def _append_clip_op(self, block, grad_name):
        raise NotImplementedError()


class ErrorClipByValue(BaseErrorClipAttr):
    """
    Clips tensor values to the range [min, max].

    Given a tensor ``t`` (see Examples below), this operation clips its values \
    to the range [``min``, ``max``] in place.

    - Any values less than min are set to min.
    - Any values greater than max are set to max.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by the user, \
        it will be set to ``-max`` by the framework.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            BATCH_SIZE = 128
            CLIP_MAX = 2e-6
            CLIP_MIN = -1e-6
            prog = fluid.framework.Program()
            with fluid.program_guard(main_program=prog):
                image = fluid.layers.data(
                    name='x', shape=[784], dtype='float32')
69 70
                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
                predict = fluid.layers.fc(
                    input=hidden2, size=10, act='softmax')
                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
                cost = fluid.layers.cross_entropy(input=predict, label=label)
                avg_cost = fluid.layers.mean(cost)
            prog_clip = prog.clone()
            prog_clip.block(0).var(hidden1.name)._set_error_clip(
                fluid.clip.ErrorClipByValue(
                    max=CLIP_MAX, min=CLIP_MIN))
    """

    def __init__(self, max, min=None):
        max = float(max)
        if min is None:
            min = -max
        else:
            min = float(min)
        self.max = max
        self.min = min

    def __str__(self):
        return "ByValue, min=%f, max=%f" % (self.min, self.max)

    def _append_clip_op(self, block, grad_name):
        clip_op_desc = block.desc.append_op()
        clip_op_desc.set_type("clip")
        clip_op_desc.set_input("X", [grad_name])
        clip_op_desc.set_output("Out", [grad_name])
        clip_op_desc._set_attr("min", self.min)
        clip_op_desc._set_attr("max", self.max)


def error_clip_callback(block, context):
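    """Backward callback that applies per-variable error clipping.

    Invoked after each backward op is appended to ``block``: for every
    gradient output of that op whose forward variable carries an
    ``error_clip`` attribute, the corresponding clip op is appended in place.
    """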
    # the context is a grad_to_var map
    grad_to_var = context
    op_desc = block.desc.op(block.desc.op_size() - 1)
    for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
        fwd_var = block._var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
        if not (error_clip is None or isinstance(error_clip,
                                                 BaseErrorClipAttr)):
            raise TypeError(
                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
            )
        if error_clip is not None:
            error_clip._append_clip_op(block, grad_n)


class BaseGradientClipAttr(object):
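    """Base class for gradient clipping strategies.

    ``_process_context`` is called once for every (param, grad) pair so a
    strategy can accumulate shared state (e.g. a global norm) before any clip
    op is created; ``_create_operators`` is then called per pair to append the
    actual clipping ops and return the resulting (param, grad) pair.
    """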
    def __str__(self):
        raise NotImplementedError()

    def _process_context(self, context, param, grad):
        raise NotImplementedError()

    def _create_operators(self, param, grad):
        raise NotImplementedError()


class NullGradientClipAttr(BaseGradientClipAttr):
    def __str__(self):
        return "Null"

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        return param, grad


class GradientClipByValue(BaseGradientClipAttr):
    """
    Clips gradient values to the range [min, max].

    Given a tensor ``t``, this operation clips its values to the range [``min``, ``max``] in place.

    - Any values less than min are set to ``min``.
    - Any values greater than max are set to ``max``.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by the user, \
        it will be set to ``-max`` by the framework.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            w_param_attrs = fluid.ParamAttr(name=None,
              initializer=fluid.initializer.UniformInitializer(
                  low=-1.0, high=1.0, seed=0),
              learning_rate=1.0,
              regularizer=fluid.regularizer.L1Decay(1.0),
              trainable=True,
              gradient_clip=fluid.clip.GradientClipByValue(max=1.0, min=-1.0))
            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
            y_predict = fluid.layers.fc(
                input=x, size=1, param_attr=w_param_attrs)
    """

    def __init__(self, max, min=None):
        max = float(max)
        if min is None:
            min = -max
        else:
            min = float(min)
        self.max = max
        self.min = min

    def __str__(self):
        return "ByValue, min=%f, max=%f" % (self.min, self.max)

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = layers.clip(x=grad, min=self.min, max=self.max)
        return param, new_grad


class GradientClipByNorm(BaseGradientClipAttr):
    """
    Convert the input multi-dimensional Tensor :math:`X` into a Tensor whose L2 norm does not exceed the given maximum norm value ( :math:`clip\_norm` ).

    The tensor is not passed to this class directly; it is taken from the program specified by the ``main_program`` parameter of ``fluid.program_guard``.

    This class limits the L2 norm of the input :math:`X` within :math:`clip\_norm`.

    .. math::
        Out =
        \\left \{
        \\begin{aligned}
        & X & & if (norm(X) \\leq clip\_norm) \\\\
        & \\frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\\\
        \\end{aligned}
        \\right.


    where :math:`norm(X)` represents the L2 norm of :math:`X`.

    .. math::
        norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}}

    Args:
        clip_norm (float): The maximum norm value

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import paddle.fluid.core as core
            import paddle
            place = core.CPUPlace()
            prog = fluid.framework.Program()
            startup_program = fluid.framework.Program()
            with fluid.program_guard(
                        main_program=prog, startup_program=startup_program):
                image = fluid.data(
                    name='x', shape=[None, 784], dtype='float32', lod_level=0)
                label = fluid.data(
                    name='y', shape=[None, 1], dtype='int64', lod_level=0)
                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
                predict = fluid.layers.fc(
                    input=hidden2, size=10, act='softmax')
                cost = fluid.layers.cross_entropy(input=predict, label=label)
                avg_cost = fluid.layers.mean(cost)
            prog_clip = prog.clone()
            avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
            p_g = fluid.backward.append_backward(loss=avg_cost)
            p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
            with fluid.program_guard(main_program=prog_clip, startup_program=startup_program):
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByNorm(clip_norm=2.0))
                p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
            grad_list = [elem[1] for elem in p_g]
            grad_clip_list = [elem[1] for elem in p_g_clip]
            train_reader = paddle.batch(
                paddle.reader.shuffle(
                    paddle.dataset.mnist.train(), buf_size=8192),
                batch_size=128)

            exe = fluid.Executor(place)
            feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
            exe.run(startup_program)

            count = 0
            for data in train_reader():
                count += 1
                print("count:%s" % count)
                if count > 5:
                    break
                out = exe.run(prog, feed=feeder.feed(
                    data), fetch_list=grad_list)
                out_clip = exe.run(prog_clip,
                                   feed=feeder.feed(data),
                                   fetch_list=grad_clip_list)

    """

    def __init__(self, clip_norm):
        self.clip_norm = clip_norm

    def __str__(self):
        return "ByNorm, clip_norm=%f" % self.clip_norm

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm)
        return param, new_grad


class GradientClipByGlobalNorm(BaseGradientClipAttr):
    """
    Clips values of multiple tensors by the ratio of the sum of their norms.

    Given a list of tensors ``t_list`` , and a clipping ratio ``clip_norm``,
    this operation returns a instance of this class as first parameter of
C
Chengmo 已提交
291 292
    ``set_gradient_clip`` method, second parameter of ``set_gradient_clip``
    is used to compute clipped tensors list ``list_clipped`` (default value
293 294
    is ``None``, compute global norm ``global_norm`` based in all tensors).
    global norm (global_norm) of all tensors in t_list.

    To perform the clipping, the values :math:`t\_list[i]` are set to:

    .. math::

        t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)}

    where:

    .. math::

        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}

    If :math:`clip\_norm > global\_norm` then the entries in t_list remain as they are,
    otherwise they're all shrunk by the global ratio.

    Args:
        clip_norm (float): The maximum norm value
        group_name (str, optional): The group name for this clip.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import paddle.fluid.core as core
            import paddle

            place = core.CPUPlace()
            prog = fluid.framework.Program()
            startup_program = fluid.framework.Program()
            with fluid.program_guard(
                    main_program=prog, startup_program=startup_program):
                image = fluid.layers.data(
                    name='x', shape=[784], dtype='float32')
                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
                predict = fluid.layers.fc(
                    input=hidden2, size=10, act='softmax')
                cost = fluid.layers.cross_entropy(input=predict, label=label)
                avg_cost = fluid.layers.mean(cost)

            prog_clip = prog.clone()
            avg_cost_clip = prog_clip.block(0).var(avg_cost.name)

            p_g = fluid.backward.append_backward(loss=avg_cost)
            p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)

            with fluid.program_guard(main_program=prog_clip, startup_program=startup_program):
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
                p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)

            grad_list = [elem[1] for elem in p_g]
            grad_clip_list = [elem[1] for elem in p_g_clip]

            train_reader = paddle.batch(
                paddle.reader.shuffle(
                    paddle.dataset.mnist.train(), buf_size=8192),
                batch_size=128)

            exe = fluid.Executor(place)
            feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
            exe.run(startup_program)

            count = 0
            for data in train_reader():
                count += 1
                print("count:%s" % count)
                if count > 5:
                    break
                out = exe.run(prog, feed=feeder.feed(
                    data), fetch_list=grad_list)
                out_clip = exe.run(prog_clip,
                                   feed=feeder.feed(data),
                                   fetch_list=grad_clip_list)

    """

    def __init__(self, clip_norm, group_name="default_group"):
        if not isinstance(group_name, six.string_types):
            raise TypeError("'group_name' must be a %s." % (six.string_types))

        self.clip_norm = clip_norm
        self.group_name = group_name

    def __str__(self):
        return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name,
                                                              self.clip_norm)

    def _process_context(self, context, param, grad):
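        # First pass of global-norm clipping: record this gradient's squared
        # L2 norm under the group's key in ``context``; the group-wide norm
        # is assembled later in _create_operators.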
        if self.group_name not in context:
            context[self.group_name] = []
            context[self.group_name + "_clip_value"] = self.clip_norm
            context[self.group_name + "_clip"] = layers.fill_constant(
                shape=[1], dtype="float32", value=self.clip_norm)
        else:
            if not self.clip_norm == context[self.group_name + "_clip_value"]:
                raise ValueError(
                    "All parameters' 'clip_norm' of a same group should be the same"
                )

        merge_grad = grad
        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
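            # A sparse (SELECTED_ROWS) gradient may contain duplicate rows;
            # merge them and convert to a dense tensor before taking its norm.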
            merge_grad = layers.merge_selected_rows(grad)
            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)

        square = layers.square(merge_grad)
        local_norm_var = layers.reduce_sum(input=square)
        context[self.group_name].append(local_norm_var)

        self.context = context

    def _create_operators(self, param, grad):
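        # Second pass: the first gradient of the group to reach this point
        # computes the shared scale clip_norm / max(global_norm, clip_norm);
        # every gradient in the group is then multiplied by that scale.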
        group_scale_name = self.group_name + "_scale"
        if group_scale_name not in self.context:
            group_norm_var = layers.sums(input=self.context[self.group_name])
            group_norm_var = layers.sqrt(x=group_norm_var)
            clip_var = self.context[self.group_name + "_clip"]
            group_scale_var = layers.elementwise_div(
                x=clip_var,
                y=layers.elementwise_max(
                    x=clip_var, y=group_norm_var))
            assert group_scale_var.shape == (1, )
            self.context[group_scale_name] = group_scale_var

        new_grad = layers.elementwise_mul(
            x=grad, y=self.context[group_scale_name])

        return param, new_grad


@framework.dygraph_not_support
def set_gradient_clip(clip, param_list=None, program=None):
    """
    To specify parameters that require gradient clipping.

    Args:
        clip (BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
                for example :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
                which describes the type and detailed attributes of the required gradient clipping.
        param_list (list(Variable), optional): Parameters that require gradient clip.
                It can be a list of Parameters or a list of parameter names.
                Default None, meaning that all parameters in the program will be included.
        program (Program, optional): The program where parameters are located.
                Default None, meaning that :ref:`api_fluid_default_main_program` is used.

    Returns:
        None

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid

            def network():
                image = fluid.data(name='image', shape=[
                                   None, 28], dtype='float32')
                param_attr1 = fluid.ParamAttr("fc1_param")
                fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1)
                param_attr2 = fluid.ParamAttr("fc2_param")
                fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2)
                loss = fluid.layers.reduce_mean(fc2)
                return loss


            # network 1: clip all parameter gradient
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 2: clip parameter gradient by name
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
                    param_list=["fc1_param", "fc2_param"])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 3: clip parameter gradient by var
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                param_var1 = fluid.default_main_program().global_block().var("fc1_param")
                param_var2 = fluid.default_main_program().global_block().var("fc2_param")
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
                    param_list=[param_var1, param_var2])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)
    """
    if not isinstance(clip, BaseGradientClipAttr):
        raise TypeError(
            "'clip' should be an instance of BaseGradientClipAttr's derived class"
        )
    if program is None:
        program = framework.default_main_program()
    if param_list is None:
        param_list = program.block(0).all_parameters()
    if all(isinstance(elem, six.string_types) for elem in param_list):
        param_list = [program.block(0).var(elem) for elem in param_list]
    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
        raise TypeError(
            "'param_list' should be a list of Parameter or basestring(parameter's name)."
        )

    for param in param_list:
        param.gradient_clip_attr = copy.deepcopy(clip)


def append_gradient_clip_ops(param_grads):
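    """Append gradient clipping ops for every (param, grad) in param_grads.

    Two passes are made over the pairs: the first lets each parameter's clip
    attribute gather shared state via _process_context (needed for global-norm
    clipping), the second appends the actual clip ops via _create_operators
    and collects the resulting (param, grad) pairs.
    """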
    context = dict()
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard(
            [p, g]), framework.name_scope('append_clip_@CLIP'):
            clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
            if clip_attr is None:
                clip_attr = NullGradientClipAttr()
            if not isinstance(clip_attr, BaseGradientClipAttr):
                raise TypeError(
                    "clip attribute should be an instance of BaseGradientClipAttr"
                )

            clip_attr._process_context(context=context, param=p, grad=g)

    res = []
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard(
            [p, g]), framework.name_scope('append_gradient_clip_@CLIP'):
            res.append(clip_attr._create_operators(param=p, grad=g))

    # Fix the (param, grad) mapping recorded in 'op_role_var' on clip ops:
    # the original gradient may have been replaced by its clipped version.
    clip_flag = '@CLIP'
    for p, g in param_grads:
        if g is None:
            continue
        for op in p.block.program.global_block().ops:
            if 'op_namescope' in op.all_attrs() and clip_flag in op.attr(
                    "op_namescope"):
                if op.attr('op_role_var'):
                    param_name = op.attr('op_role_var')[0]
                    index = 0
                    for i in range(len(res)):
                        if res[i][0].name == param_name:
                            index = i
                    correct_p_g = [param_name, res[index][1].name]
                    op._set_attr('op_role_var', correct_p_g)
    return res


ClipByValue = GradientClipByValue
ClipByNorm = GradientClipByNorm
ClipByGlobalNorm = GradientClipByGlobalNorm