#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import copy
import six
import warnings

import functools
from . import layers
from . import framework
from . import core
from . import name_scope
from .dygraph import base as imperative_base

__all__ = [
    'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue',
    'GradientClipByNorm', 'GradientClipByGlobalNorm'
]


class BaseErrorClipAttr(object):
    def __str__(self):
        raise NotImplementedError()

    def _append_clip_op(self, block, grad_name):
        raise NotImplementedError()


class ErrorClipByValue(BaseErrorClipAttr):
    """
    Clips tensor values to the range [min, max].

    Given a tensor ``t`` (see Examples below), this operation clips its values \
    to the range [``min``, ``max``] in place.

    - Any values less than min are set to min.
    - Any values greater than max are set to max.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by the user, \
        it will be set to ``-max`` by the framework.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            BATCH_SIZE = 128
            CLIP_MAX = 2e-6
            CLIP_MIN = -1e-6
            prog = fluid.framework.Program()
            with fluid.program_guard(main_program=prog):
                image = fluid.layers.data(
                    name='x', shape=[784], dtype='float32')
                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
                predict = fluid.layers.fc(
                    input=hidden2, size=10, act='softmax')
                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
                cost = fluid.layers.cross_entropy(input=predict, label=label)
                avg_cost = fluid.layers.mean(cost)
            prog_clip = prog.clone()
            prog_clip.block(0).var(hidden1.name)._set_error_clip(
                fluid.clip.ErrorClipByValue(
                    max=CLIP_MAX, min=CLIP_MIN))
    """

    def __init__(self, max, min=None):
        max = float(max)
        if min is None:
            min = -max
        else:
            min = float(min)
        self.max = max
        self.min = min

    def __str__(self):
        return "ByValue, min=%f, max=%f" % (self.min, self.max)

    def _append_clip_op(self, block, grad_name):
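        # Append an in-place "clip" op on the gradient: "X" and "Out" are the same
        # variable, so the gradient is clipped to [self.min, self.max] in place.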
        clip_op_desc = block.desc.append_op()
        clip_op_desc.set_type("clip")
        clip_op_desc.set_input("X", [grad_name])
        clip_op_desc.set_output("Out", [grad_name])
        clip_op_desc._set_attr("min", self.min)
        clip_op_desc._set_attr("max", self.max)


def error_clip_callback(block, context):
    # the context is a grad_to_var map
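    # This callback inspects the most recently appended op and, for each gradient
    # output whose forward variable carries an error_clip attribute, appends the
    # corresponding clip op right after it.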
    grad_to_var = context
    op_desc = block.desc.op(block.desc.op_size() - 1)
    for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
        fwd_var = block._var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
        if not (error_clip is None or isinstance(error_clip,
                                                 BaseErrorClipAttr)):
            raise TypeError(
                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
            )
        if error_clip is not None:
            error_clip._append_clip_op(block, grad_n)


class GradientClipBase(object):
    def __init__(self, need_clip=None):
        if need_clip is not None and not callable(need_clip):
            raise TypeError(
                "The type of need_clip must be funciton, and it can filter out "
                "parameter that does't need gradient clip. This function must return "
                "True or False, and True means that clipping is required. Please refer to "
                "API documention of GradientClipByGlobalNorm / GradientClipByNorm "
                "/GradientClipByValue.")
        self._need_clip_func = need_clip

    def __str__(self):
        raise NotImplementedError()

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        raise NotImplementedError

    def _static_clip(self, params_grads):
        raise NotImplementedError

    def __call__(self, params_grads):
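        # Shared entry point: clip eagerly in dygraph mode, otherwise append clip
        # ops to the program; per-parameter 'gradient_clip_attr' values set via
        # set_gradient_clip are superseded (see the warning below).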
        assert len(
            params_grads
        ) > 0, "The number of trainable parameters should be greater than 0."
        if framework.in_dygraph_mode():
            return self._dygraph_clip(params_grads)
        else:
            for p, g in params_grads:
                if getattr(p, 'gradient_clip_attr', None) is not None:
                    warnings.warn(
                        "'set_gradient_clip' will be ineffective, because you have "
                        "pass 'grad_clip' into 'minimize'. So, 'set_gradient_clip' "
                        "is redundant and you can remove it.")
                    break
            return self._static_clip(params_grads)

    def _process_context(self, context, param, grad):
        raise NotImplementedError()

    def _create_operators(self, param, grad):
        raise NotImplementedError()


class GradientClipByValue(GradientClipBase):
    """
    Clips gradient values to the range [min, max].

    Given a tensor ``t``, this operation clips its values to the range [``min``, ``max``] in place.

    - Any values less than min are set to ``min``.
    - Any values greater than max are set to ``max``.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by the user, \
        it will be set to ``-max`` by the framework.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            w_param_attrs = fluid.ParamAttr(name=None,
              initializer=fluid.initializer.UniformInitializer(
                  low=-1.0, high=1.0, seed=0),
              learning_rate=1.0,
              regularizer=fluid.regularizer.L1Decay(1.0),
              trainable=True,
              gradient_clip=fluid.clip.GradientClipByValue(min=-1.0, max=1.0))
            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
            y_predict = fluid.layers.fc(
                input=x, size=1, param_attr=w_param_attrs)
    """

    def __init__(self, max, min=None, need_clip=None):
        super(GradientClipByValue, self).__init__(need_clip)
        if min is None:
            assert (max > 0.0)
            min = -max
        self.max = float(max)
        self.min = float(min)

    def __str__(self):
        return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(p):
                params_and_grads.append((p, g))
                continue
            new_grad = layers.clip(x=g, min=self.min, max=self.max)
            params_and_grads.append((p, new_grad))
        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
                    continue
                if self._need_clip_func is not None and not self._need_clip_func(
                        p):
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
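                    # _optimized_guard is assumed to tag the clip op below with the
                    # optimize op role and op_role_var=[p, g], like optimizer ops.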
                    new_grad = layers.clip(x=g, min=self.min, max=self.max)
                params_and_grads.append((p, new_grad))
        _correct_clip_op_role_var(params_and_grads)
        return params_and_grads

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = layers.clip(x=grad, min=self.min, max=self.max)
        return param, new_grad


class GradientClipByNorm(GradientClipBase):
    """
    Convert the input multidimensional Tensor :math:`X` to a multidimensional Tensor whose L2 norm does not exceed the given two-norm maximum ( :math:`clip\_norm` ).

    The tensor is not passed to this class directly; it is taken from the program specified by the ``main_program`` parameter of ``fluid.program_guard``.

    This class limits the L2 norm of the input :math:`X` within :math:`clip\_norm`.

    .. math::
        Out =
        \\left \{
        \\begin{aligned}
        & X & & if (norm(X) \\leq clip\_norm) \\\\
        & \\frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\\\
        \\end{aligned}
        \\right.


    where :math:`norm(X)` represents the L2 norm of :math:`X`.

    .. math::
        norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}}

    Args:
        clip_norm(float): The maximum norm value

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import paddle.fluid.core as core
            import paddle
            place = core.CPUPlace()
            prog = fluid.framework.Program()
            startup_program = fluid.framework.Program()
            with fluid.program_guard(
                        main_program=prog, startup_program=startup_program):
                image = fluid.data(
                    name='x', shape=[None, 784], dtype='float32', lod_level=0)
                label = fluid.data(
                    name='y', shape=[None, 1], dtype='int64', lod_level=0)
                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
                predict = fluid.layers.fc(
                    input=hidden2, size=10, act='softmax')
                cost = fluid.layers.cross_entropy(input=predict, label=label)
                avg_cost = fluid.layers.mean(cost)
            prog_clip = prog.clone()
            avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
            p_g = fluid.backward.append_backward(loss=avg_cost)
            p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
            with fluid.program_guard(main_program=prog_clip, startup_program=startup_program):
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByNorm(clip_norm=2.0))
                p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
            grad_list = [elem[1] for elem in p_g]
            grad_clip_list = [elem[1] for elem in p_g_clip]
            train_reader = paddle.batch(
                paddle.reader.shuffle(
                    paddle.dataset.mnist.train(), buf_size=8192),
                batch_size=128)

            exe = fluid.Executor(place)
            feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
            exe.run(startup_program)

            count = 0
            for data in train_reader():
                count += 1
                print("count:%s" % count)
                if count > 5:
                   break
                out = exe.run(prog, feed=feeder.feed(
                    data), fetch_list=grad_list)
                out_clip = exe.run(prog_clip,
                                   feed=feeder.feed(data),
                                   fetch_list=grad_clip_list)

    """

    def __init__(self, clip_norm, need_clip=None):
        super(GradientClipByNorm, self).__init__(need_clip)
        self.clip_norm = float(clip_norm)

    def __str__(self):
        return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(p):
                params_and_grads.append((p, g))
                continue
            new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
            params_and_grads.append((p, new_grad))
        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
                    continue
                if self._need_clip_func is not None and not self._need_clip_func(
                        p):
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
                params_and_grads.append((p, new_grad))
        _correct_clip_op_role_var(params_and_grads)
        return params_and_grads

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm)
        return param, new_grad


class GradientClipByGlobalNorm(GradientClipBase):
    """
    Clips values of multiple tensors by the ratio of the sum of their norms.

    Given a list of tensors ``t_list`` and a clipping ratio ``clip_norm``, this
    operation returns an instance of this class, which is passed as the first
    parameter of the ``set_gradient_clip`` method. The second parameter of
    ``set_gradient_clip`` is used to compute the list of clipped tensors
    ``list_clipped`` (its default value is ``None``, in which case the global
    norm ``global_norm`` is computed over all tensors in ``t_list``).

    To perform the clipping, the values :math:`t\_list[i]` are set to:

    .. math::

        t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)}

    where:

    .. math::

        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}

    If :math:`clip\_norm > global\_norm` then the entries in t_list remain as they are,
    otherwise they're all shrunk by the global ratio.

    Args:
        clip_norm (float): The maximum norm value
        group_name (str, optional): The group name for this clip.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import paddle.fluid.core as core
            import paddle

            place = core.CPUPlace()
            prog = fluid.framework.Program()
            startup_program = fluid.framework.Program()
            with fluid.program_guard(
                    main_program=prog, startup_program=startup_program):
                image = fluid.layers.data(
                    name='x', shape=[784], dtype='float32')
                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
                predict = fluid.layers.fc(
                    input=hidden2, size=10, act='softmax')
                cost = fluid.layers.cross_entropy(input=predict, label=label)
                avg_cost = fluid.layers.mean(cost)

            prog_clip = prog.clone()
            avg_cost_clip = prog_clip.block(0).var(avg_cost.name)

            p_g = fluid.backward.append_backward(loss=avg_cost)
            p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)

            with fluid.program_guard(main_program=prog_clip, startup_program=startup_program):
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
                p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)

            grad_list = [elem[1] for elem in p_g]
            grad_clip_list = [elem[1] for elem in p_g_clip]

            train_reader = paddle.batch(
                paddle.reader.shuffle(
                    paddle.dataset.mnist.train(), buf_size=8192),
                batch_size=128)

            exe = fluid.Executor(place)
            feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
            exe.run(startup_program)

            count = 0
            for data in train_reader():
                count += 1
                print("count:%s" % count)
                if count > 5:
                    break
                out = exe.run(prog, feed=feeder.feed(
                    data), fetch_list=grad_list)
                out_clip = exe.run(prog_clip,
                                   feed=feeder.feed(data),
                                   fetch_list=grad_clip_list)

    """

    def __init__(self, clip_norm, group_name="default_group", need_clip=None):
        super(GradientClipByGlobalNorm, self).__init__(need_clip)
        self.clip_norm = float(clip_norm)
        self.group_name = group_name

    def __str__(self):
        return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(p):
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = layers.square(merge_grad)
            sum_square = layers.reduce_sum(square)
            sum_square_list.append(sum_square)

        # all parameters have been filtered out
        if len(sum_square_list) == 0:
            return params_grads

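        # global_norm = sqrt(sum of each gradient's sum of squares); every gradient
        # is then scaled by clip_norm / max(global_norm, clip_norm).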
        global_norm_var = layers.concat(sum_square_list)
        global_norm_var = layers.reduce_sum(global_norm_var)
        global_norm_var = layers.sqrt(global_norm_var)
        max_global_norm = layers.fill_constant(
            shape=[1], dtype='float32', value=self.clip_norm)
        clip_var = layers.elementwise_div(
            x=max_global_norm,
            y=layers.elementwise_max(
                x=global_norm_var, y=max_global_norm))
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(p):
                params_and_grads.append((p, g))
                continue
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
                    continue
                if self._need_clip_func is not None and not self._need_clip_func(
                        p):
                    continue
                merge_grad = g
                with p.block.program._optimized_guard([p, g]):
                    if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                        merge_grad = layers.merge_selected_rows(g)
                        merge_grad = layers.get_tensor_from_selected_rows(
                            merge_grad)

                    square = layers.square(merge_grad)
                    sum_square = layers.reduce_sum(input=square)
                    sum_square_list.append(sum_square)

            # all parameters have been filtered out
            if len(sum_square_list) == 0:
                return params_grads

            with p.block.program._optimized_guard([p, g]):
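                # (p, g) here are simply the last pair left over from the loop above;
                # the guard only scopes the norm-aggregation ops that follow.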
                global_norm_var = layers.sums(sum_square_list)
                global_norm_var = layers.sqrt(x=global_norm_var)
                max_global_norm = layers.fill_constant(
                    shape=[1], dtype="float32", value=self.clip_norm)
                scale_var = layers.elementwise_div(
                    x=max_global_norm,
                    y=layers.elementwise_max(
                        x=max_global_norm, y=global_norm_var))

            for p, g in params_grads:
                if g is None:
                    continue
                if self._need_clip_func is not None and not self._need_clip_func(
                        p):
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_grad = layers.elementwise_mul(x=g, y=scale_var)
                params_and_grads.append((p, new_grad))

        _correct_clip_op_role_var(params_and_grads)
        return params_and_grads

    def _process_context(self, context, param, grad):
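        # Accumulate this gradient's squared L2 norm under the group's key in
        # `context`; the group-wide scale factor is computed later in
        # _create_operators.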
        if self.group_name not in context:
            context[self.group_name] = []
            context[self.group_name + "_clip_value"] = self.clip_norm
            context[self.group_name + "_clip"] = layers.fill_constant(
                shape=[1], dtype="float32", value=self.clip_norm)
        else:
            if not self.clip_norm == context[self.group_name + "_clip_value"]:
                raise ValueError(
                    "All parameters' 'clip_norm' of a same group should be the same"
                )

        merge_grad = grad
        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = layers.merge_selected_rows(grad)
            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)

        square = layers.square(merge_grad)
        local_norm_var = layers.reduce_sum(input=square)
        context[self.group_name].append(local_norm_var)

        self.context = context

    def _create_operators(self, param, grad):
        group_scale_name = self.group_name + "_scale"
        if group_scale_name not in self.context:
            group_norm_var = layers.sums(input=self.context[self.group_name])
            group_norm_var = layers.sqrt(x=group_norm_var)
            clip_var = self.context[self.group_name + "_clip"]
            group_scale_var = layers.elementwise_div(
                x=clip_var,
                y=layers.elementwise_max(
                    x=clip_var, y=group_norm_var))
            assert group_scale_var.shape == (1, )
            self.context[group_scale_name] = group_scale_var

        new_grad = layers.elementwise_mul(
            x=grad, y=self.context[group_scale_name])

        return param, new_grad


@framework.dygraph_not_support
def set_gradient_clip(clip, param_list=None, program=None):
    """
    To specify parameters that require gradient clip.

    Args:
        clip (GradientClipBase): An instance of some derived class of GradientClipBase,
                for example :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
                which describes the type and detailed attributes of required gradient clip.
        param_list (list(Variable), optional): Parameters that require gradient clip.
                It can be a list of parameter or a list of parameter's name.
                Default None, meaning that all parameters in the program will be included.
        program (Program, optional): The program where parameters are located.
                Default None, meaning that using :ref:`api_fluid_default_main_program` .

    Returns:
        None

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid

            def network():
                image = fluid.data(name='image', shape=[
                                   None, 28], dtype='float32')
                param_attr1 = fluid.ParamAttr("fc1_param")
                fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1)
                param_attr2 = fluid.ParamAttr("fc2_param")
                fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2)
                loss = fluid.layers.reduce_mean(fc2)
                return loss


            # network 1: clip all parameter gradient
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 2: clip parameter gradient by name
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
                    param_list=["fc1_param", "fc2_param"])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 3: clip parameter gradient by var
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                param_var1 = fluid.default_main_program().global_block().var("fc1_param")
                param_var2 = fluid.default_main_program().global_block().var("fc2_param")
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
                    param_list=[param_var1, param_var2])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)
    """
    warnings.warn("Caution! 'set_gradient_clip' is not recommended "
                  "and may be deprecated in future! "
                  "We recommend a new strategy: clip gradient by "
                  "'optimizer.minimize(loss, grad_clip=clip)'. "
                  "This method can reduce the mistakes, please "
                  "see documention of 'optimzier.minimize'.")

    if not isinstance(clip, GradientClipBase):
        raise TypeError(
            "'clip' should be an instance of GradientClipBase's derived class")
    if program is None:
        program = framework.default_main_program()

    for op in program.block(0).ops:
        if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr(
                "op_namescope"):
            warnings.warn(
                "'minimize' has been invoked before, this will make 'set_gradient_clip' "
                "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'."
            )
            break

    if param_list is None:
        param_list = program.block(0).all_parameters()
    if all(isinstance(elem, six.string_types) for elem in param_list):
        param_list = [program.block(0).var(elem) for elem in param_list]
    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
        raise TypeError(
            "'param_list' should be a list of Parameter or basestring(parameter's name)."
        )

    for param in param_list:
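        # Each selected parameter keeps its own deep copy of the clip attribute;
        # it is read back later by append_gradient_clip_ops.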
        param.gradient_clip_attr = copy.deepcopy(clip)


def append_gradient_clip_ops(param_grads):
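    # Two passes over param_grads: the first lets each clip attribute gather
    # group-level statistics via _process_context, the second appends the actual
    # clip ops via _create_operators.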
    context = dict()
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard(
            [p, g]), framework.name_scope('gradient_clip_@CLIP'):
            clip_attr = getattr(p, 'gradient_clip_attr', None)
            if clip_attr is None:
                return param_grads
            if not isinstance(clip_attr, GradientClipBase):
                raise TypeError(
                    "clip attribute should be an instance of GradientClipBase")

            clip_attr._process_context(context=context, param=p, grad=g)

    res = []
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard(
            [p, g]), framework.name_scope('gradient_clip_@CLIP'):
            param, new_grad = clip_attr._create_operators(param=p, grad=g)
            res.append([param, new_grad])

    _correct_clip_op_role_var(res)
    return res


# Correct the wrong mapping between param and grad in the 'op_role_var' attribute of clip ops.
def _correct_clip_op_role_var(params_grads):
    for param, grad in params_grads:
        if grad is None:
            continue
        for op in param.block.program.global_block().ops:
            if 'op_namescope' in op.all_attrs() and "gradient_clip" in op.attr(
                    "op_namescope"):
                if op.attr('op_role_var'):
                    param_name = op.attr('op_role_var')[0]
                    index = 0
                    for i in range(len(params_grads)):
                        if params_grads[i][0].name == param_name:
                            index = i
                    correct_p_g = [param_name, params_grads[index][1].name]
                    op._set_attr('op_role_var', correct_p_g)


ClipByValue = GradientClipByValue
ClipByNorm = GradientClipByNorm
ClipByGlobalNorm = GradientClipByGlobalNorm