# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
import six
from fake_reader import fake_imdb_reader
from paddle.fluid.clip import _allow_pure_fp16_global_norm_clip

paddle.enable_static()


def bow_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2):
    """
    BOW net
    This model is from https://github.com/PaddlePaddle/models:
    fluid/PaddleNLP/text_classification/nets.py
    """
    emb = fluid.layers.embedding(input=data,
                                 is_sparse=True,
                                 size=[dict_dim, emb_dim])
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = paddle.mean(x=cost)

    return avg_cost


class TestGradientClip(unittest.TestCase):

    def setUp(self):
        self.word_dict_len = 5147
        self.BATCH_SIZE = 2
        reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100)
        self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE)
        self.clip_gradient = lambda x: None
        self.init()

    def init(self):
        pass

    def get_places(self):
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))
        return places

    def check_clip_result(self, out, out_clip):
        pass

    def check_gradient_clip(self, place, dtype='float32'):
        prog = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program=prog,
                                 startup_program=startup_program):
            image = fluid.data(name="a", shape=[-1, 784], dtype='float32')
            label = fluid.data(name="b", shape=[-1, 1], dtype='int64')
            if dtype != 'float32':
                image_cast = paddle.cast(image, dtype)
                hidden = fluid.layers.fc(input=image_cast, size=32, act='relu')
            else:
                hidden = fluid.layers.fc(input=image, size=32, act='relu')
            predict = fluid.layers.fc(input=hidden, size=10, act='softmax')

            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = paddle.mean(cost)

        prog_clip = prog.clone()
        avg_cost_clip = prog_clip.block(0).var(avg_cost.name)

        p_g = fluid.backward.append_backward(loss=avg_cost)
        p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)

        p_g = sorted(p_g, key=lambda x: x[0].name)
        p_g_clip = sorted(p_g_clip, key=lambda x: x[0].name)
        with fluid.program_guard(main_program=prog_clip,
                                 startup_program=startup_program):
            p_g_clip = self.clip_gradient(p_g_clip)

        grad_list = [elem[1] for elem in p_g]
        grad_clip_list = [elem[1] for elem in p_g_clip]

        train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=3)
        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
        exe.run(startup_program)

        data = next(train_reader())
        out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
        out_clip = exe.run(prog_clip,
                           feed=feeder.feed(data),
                           fetch_list=grad_clip_list)
        self.check_clip_result(out, out_clip)

    def check_sparse_gradient_clip(self, place):
        prog = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program=prog,
                                 startup_program=startup_program):
            data = fluid.data(name="words",
                              shape=[-1, 1],
                              dtype="int64",
                              lod_level=1)
            label = fluid.data(name="label", shape=[-1, 1], dtype="int64")
            cost = bow_net(data, label, self.word_dict_len)

            self.backward_and_optimize(cost)

        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
        exe.run(startup_program)

        data = next(self.train_data())
        val = exe.run(prog, feed=feeder.feed(data), fetch_list=[cost])[0]
        self.assertEqual((1, ), val.shape)
        self.assertFalse(np.isnan(val))

    def backward_and_optimize(self, cost):
        pass


class TestGradientClipByGlobalNorm(TestGradientClip):

    def init(self):
        self.clip_norm = 0.2

    def check_clip_result(self, out, out_clip):
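        # Reference result computed with numpy, following the global-norm rule
        # used by GradientClipByGlobalNorm:
        #   global_norm = sqrt(sum_i ||g_i||_2^2)
        #   scale = clip_norm / max(clip_norm, global_norm)
        #   g_i_clipped = g_i * scale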
        global_norm = 0
        for v in out:
            global_norm += np.sum(np.square(v))
        global_norm = np.sqrt(global_norm)
        scale = self.clip_norm / np.maximum(self.clip_norm, global_norm)
        for i in range(len(out)):
            out[i] = scale * out[i]

        for u, v in zip(out, out_clip):
            self.assertTrue(
                np.allclose(a=u, b=v, rtol=1e-5, atol=1e-8),
                "gradient clip by global norm has wrong results!, \nu={}\nv={}\ndiff={}"
                .format(u, v, u - v))

    # test whether the output is correct when 'set_gradient_clip' is used
    def test_old_gradient_clip(self):

        def func(params_grads):
            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
            fluid.clip.set_gradient_clip(clip)
            return fluid.clip.append_gradient_clip_ops(params_grads)

        self.clip_gradient = func
        self.check_gradient_clip(fluid.CPUPlace())

    # test whether the output is correct when grad_clip is used
    def test_new_gradient_clip(self):

        def func(params_grads):
            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
            return clip(params_grads)

        self.clip_gradient = func
        self.check_gradient_clip(fluid.CPUPlace())

    # test whether the output is correct when grad_clip is used with float64
    def test_new_gradient_clip_fp64(self):

        def func(params_grads):
            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
            return clip(params_grads)

        self.clip_gradient = func
        self.check_gradient_clip(fluid.CPUPlace(), "float64")

    # invoke 'set_gradient_clip' in the wrong order
    def test_wrong_API_order(self):

        def backward_func(cost):
            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
            fluid.clip.set_gradient_clip(clip)
            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
                                                grad_clip=clip)
            # if 'set_gradient_clip' and 'optimizer(grad_clip=...)' are used together, 'set_gradient_clip' will be ineffective
            sgd_optimizer.minimize(cost)
            # 'set_gradient_clip' must be called before 'minimize', otherwise it will be ineffective
            fluid.clip.set_gradient_clip(clip)

        self.backward_and_optimize = backward_func
        for place in self.get_places():
            self.check_sparse_gradient_clip(place)

    # raise TypeError
    def test_typeError(self):
        # the optimizer's grad_clip argument must be an instance of a GradientClipBase subclass
        with self.assertRaises(TypeError):
            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,
                                                grad_clip="test")

    # if grad is None or does not need clipping
    def test_none_grad_fp32(self):
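        # For fp32, global-norm clipping is expected to emit one squared_l2_norm
        # per gradient, sum them, take sqrt, and then scale every gradient by
        # clip_norm / max(clip_norm, global_norm).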
        ops = self._test_none_grad_helper("float32")
        self.assertListEqual(ops, [
            'squared_l2_norm', 'squared_l2_norm', 'sum', 'sqrt',
            'fill_constant', 'elementwise_max', 'elementwise_div',
            'elementwise_mul', 'elementwise_mul'
        ])

    def test_none_grad_fp16(self):
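        # For fp16, the norm is expected to be accumulated with square/reduce_sum
        # and cast to fp32, so extra 'cast' ops appear around the norm
        # computation and the final scaling.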
        ops = self._test_none_grad_helper("float16")
        self.assertListEqual(ops, [
            'square', 'reduce_sum', 'square', 'reduce_sum', 'sum', 'cast',
            'sqrt', 'fill_constant', 'elementwise_max', 'elementwise_div',
            'cast', 'elementwise_mul', 'cast', 'elementwise_mul'
        ])

    def _test_none_grad_helper(self, dtype):
        prog = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program=prog,
                                 startup_program=startup_program):
            clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
            x = fluid.default_main_program().global_block().create_parameter(
                name="x", shape=[2, 3], dtype=dtype)
            y = fluid.default_main_program().global_block().create_parameter(
                name="y", shape=[2, 3], dtype=dtype)

            # (x, None) should not be returned
            params_grads = [(x, None), (x, y), (y, x)]
            params_grads = clip(params_grads)
            self.assertTrue(
                len(params_grads) == 2,
                "ClipByGlobalNorm: when grad is None, it shouldn't be returned by gradient clip!"
            )

            ops = [op.type for op in x.block.ops]
        return ops


class TestGradientClipByNorm(TestGradientClip):

    def init(self):
        self.clip_norm = 0.2

    def check_clip_result(self, out, out_clip):
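        # GradientClipByNorm works per tensor: each gradient is scaled by
        # clip_norm / max(clip_norm, ||g||_2) independently of the others.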
        for u, v in zip(out, out_clip):
            norm = np.sqrt(np.sum(np.power(u, 2)))
            scale = self.clip_norm / np.maximum(self.clip_norm, norm)
            u = u * scale
            self.assertTrue(np.allclose(a=u, b=v, rtol=1e-5, atol=1e-8),
                            "gradient clip by norm has wrong results!")

    # test whether the output is correct when grad_clip is used
    def test_gradient_clip(self):

        def func(params_grads):
            clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
            return clip(params_grads)

        self.clip_gradient = func
        self.check_gradient_clip(fluid.CPUPlace())

    # if grad is None or does not need clipping
    def test_none_grad(self):
        clip = fluid.clip.GradientClipByNorm(self.clip_norm)
        x = fluid.default_main_program().global_block().create_parameter(
            name="x", shape=[2, 3], dtype="float32", need_clip=False)
        y = fluid.default_main_program().global_block().create_parameter(
            name="y", shape=[2, 3], dtype="float32", need_clip=False)

        # (x, None) should not be returned
        params_grads = [(x, None), (x, y)]
        params_grads = clip(params_grads)
        self.assertTrue(
            len(params_grads) == 1,
            "ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!"
        )
        self.assertTrue(
            params_grads[0][1].name == 'y',
            "ClipGradByNorm: grad should not be clipped when filtered out!")


class TestGradientClipByValue(TestGradientClip):

    def init(self):
        self.max = 0.2
        self.min = 0.1

    def check_clip_result(self, out, out_clip):
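        # GradientClipByValue clips every gradient element into [min, max], so
        # the reference is a plain np.clip on each gradient.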
        for i, v in enumerate(out):
            out[i] = np.clip(v, self.min, self.max)
        for u, v in zip(out, out_clip):
            u = np.clip(u, self.min, self.max)
            self.assertTrue(np.allclose(a=u, b=v, rtol=1e-6, atol=1e-8),
                            "gradient clip by value has wrong results!")

    # test whether the output is correct when grad_clip is used
    def test_gradient_clip(self):

        def func(params_grads):
            clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
            return clip(params_grads)

        self.clip_gradient = func
        self.check_gradient_clip(fluid.CPUPlace())

    # if grad is None or does not need clipping
    def test_none_grad(self):
        clip = fluid.clip.GradientClipByValue(self.max, self.min)
        x = fluid.default_main_program().global_block().create_parameter(
            name="x", shape=[2, 3], dtype="float32", need_clip=False)
        y = fluid.default_main_program().global_block().create_parameter(
            name="y", shape=[2, 3], dtype="float32", need_clip=False)

        # (x, None) should not be returned
        params_grads = [(x, None), (x, y)]
        params_grads = clip(params_grads)
        self.assertTrue(
            len(params_grads) == 1,
            "ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!"
        )
        self.assertTrue(
            params_grads[0][1].name == 'y',
            "ClipGradByValue: grad should not be clipped when filtered out!")


class TestDygraphGradientClip(unittest.TestCase):

    def test_gradient_clip(self):
        with fluid.dygraph.guard():
            linear = fluid.dygraph.Linear(5, 5)
            inputs = fluid.layers.uniform_random([16, 5], min=-10,
                                                 max=10).astype('float32')
            out = linear(fluid.dygraph.to_variable(inputs))
            loss = fluid.layers.reduce_mean(out)
            loss.backward()
            sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=0.0,
                parameter_list=linear.parameters(),
                grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1))
            self.check_clip_result(loss, sgd_optimizer)

    def check_clip_result(self, loss, optimizer):
        pass


class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):

    def setUp(self):
        self.clip_norm = 0.8
        self.clip1 = fluid.clip.GradientClipByGlobalNorm(
            clip_norm=self.clip_norm)
        self.clip2 = fluid.clip.GradientClipByGlobalNorm(
            clip_norm=self.clip_norm)

    def check_clip_result(self, loss, optimizer):
        # if grad is None
        x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"),
                                      name="x")
        y = fluid.dygraph.to_variable(np.array([3, 4]).astype("float32"),
                                      name="y")
        assert len(self.clip1([(x, x), (x, y), (x, None)])) == 2
        # get params and grads from network
        opt, params_grads = optimizer.minimize(loss)
        _, grads = zip(*params_grads)
        params_grads = self.clip2(params_grads)
        _, grads_clip = zip(*params_grads)

        global_norm = 0
        for u in grads:
            u = u.numpy()
            global_norm += np.sum(np.power(u, 2))
        global_norm = np.sqrt(global_norm)

        global_norm_clip = 0
        for v in grads_clip:
            v = v.numpy()
            global_norm_clip += np.sum(np.power(v, 2))
        global_norm_clip = np.sqrt(global_norm_clip)

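        # After clipping, the global norm of the gradients should equal
        # min(original global norm, clip_norm).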
        a = np.minimum(global_norm, self.clip_norm)
        b = global_norm_clip
        self.assertTrue(
            np.isclose(a=a, b=b, rtol=1e-6, atol=1e-8),
            "gradient clip by global norm has wrong results, expected:%f, but received:%f"
            % (a, b))


class TestDygraphGradientClipByNorm(TestDygraphGradientClip):

    def setUp(self):
        self.clip_norm = 0.8
        self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)

    def check_clip_result(self, loss, optimizer):
        # if grad is None
        x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"))
        assert len(self.clip([(x, None)])) == 0
        # get params and grads from network
        self.clip([(fluid.dygraph.to_variable(np.array([2, 3])), None)])
        opt, params_grads = optimizer.minimize(loss)
        _, grads = zip(*params_grads)
        params_grads = self.clip(params_grads)
        _, grads_clip = zip(*params_grads)

        for u, v in zip(grads, grads_clip):
            u = u.numpy()
            v = v.numpy()
            a = np.sqrt(np.sum(np.power(u, 2)))
            a = np.minimum(a, self.clip_norm)
            b = np.sqrt(np.sum(np.power(v, 2)))
            self.assertTrue(
                np.isclose(a=a, b=b, rtol=1e-6, atol=1e-8),
                "gradient clip by norm has wrong results, expected:%f, but received:%f"
                % (a, b))


class TestDygraphGradientClipByValue(TestDygraphGradientClip):

    def setUp(self):
        self.max = 0.2
        self.min = 0.1
        self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)

    def check_clip_result(self, loss, optimizer):
        # if grad is None
        x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"))
        assert len(self.clip([(x, None)])) == 0
        # get params and grads from network
        opt, params_grads = optimizer.minimize(loss)
        _, grads = zip(*params_grads)
        params_grads = self.clip(params_grads)
        _, grads_clip = zip(*params_grads)
        for u, v in zip(grads, grads_clip):
            u = np.clip(u.numpy(), self.min, self.max)
            v = v.numpy()
            self.assertTrue(np.allclose(a=u, b=v, rtol=1e-6, atol=1e-8),
                            "gradient clip by value has wrong results!")


class SimpleNet(paddle.nn.Layer):

    def __init__(self):
        super(SimpleNet, self).__init__()
        self.linear = paddle.nn.Linear(5, 5)
        self.batch_norm = paddle.nn.BatchNorm(5)

    def forward(self, x):
        x = self.linear(x)
        x = self.batch_norm(x)
        return x


class TestDygraphGradientClipFP16(unittest.TestCase):

    def test_gradient_clip(self):
        if fluid.core.is_compiled_with_cuda():
            with fluid.dygraph.guard():
                paddle.seed(10)
                model = SimpleNet()
                sgd_optimizer = paddle.optimizer.SGD(
                    learning_rate=0.0, parameters=model.parameters())
                model, sgd_optimizer = paddle.amp.decorate(
                    models=model, optimizers=sgd_optimizer, level='O2')
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
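                # expected AMP + clip sequence: scale the loss, run backward,
                # unscale the gradients, clip them, then let the scaler step
                # the optimizer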
                inputs = fluid.layers.uniform_random([1, 5], min=-10,
                                                     max=10).astype('float32')
                with paddle.amp.auto_cast(level='O2'):
                    out = model(fluid.dygraph.to_variable(inputs))
                    loss = fluid.layers.reduce_mean(out)
                scaled = scaler.scale(loss)
                scaled.backward()
                scaler.unscale_(sgd_optimizer)
                # before clip
                params_grads = []
                for param in model.parameters():
                    if param.stop_gradient:
                        continue
                    if param._grad_ivar() is not None:
                        params_grads.append((param, param._grad_ivar()))
                _, grads = zip(*params_grads)
                # clip grads
                clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.8)
                params_grads = clip(params_grads)
                _, grads_clip = zip(*params_grads)
                # param update
                scaler.step(sgd_optimizer)
                scaler.update()

                global_norm = 0
                for u in grads:
                    u = u.numpy()
                    global_norm += np.sum(np.power(u, 2))
                global_norm = np.sqrt(global_norm)
                global_norm_clip = 0
                for v in grads_clip:
                    v = v.numpy()
                    global_norm_clip += np.sum(np.power(v, 2))
                global_norm_clip = np.sqrt(global_norm_clip)

                a = np.minimum(global_norm, 0.8)
                b = global_norm_clip
                self.assertTrue(
                    np.isclose(a=a, b=b, rtol=1e-3, atol=1e-8),
                    "gradient clip by global norm has wrong results, expected:%f, but received:%f"
                    % (a, b))


class TestDygraphGradientClipFP64(unittest.TestCase):

    def test_gradient_clip(self):
        with fluid.dygraph.guard():
            inputs = fluid.layers.uniform_random([16, 5], min=-10,
                                                 max=10).astype('float64')
            linear = fluid.dygraph.Linear(5, 5, dtype="float64")
            out = linear(fluid.dygraph.to_variable(inputs))
            loss = fluid.layers.reduce_mean(out)
            loss.backward()
            # before clip
            params_grads = []
            for param in linear.parameters():
                if param.stop_gradient:
                    continue
                if param._grad_ivar() is not None:
                    params_grads.append((param, param._grad_ivar()))
            _, grads = zip(*params_grads)
            # clip grads
            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1)
            params_grads = clip(params_grads)
            _, grads_clip = zip(*params_grads)

            global_norm = 0
            for u in grads:
                u = u.numpy()
                global_norm += np.sum(np.power(u, 2))
            global_norm = np.sqrt(global_norm)

            global_norm_clip = 0
            for v in grads_clip:
                v = v.numpy()
                global_norm_clip += np.sum(np.power(v, 2))
            global_norm_clip = np.sqrt(global_norm_clip)

            a = np.minimum(global_norm, 0.1)
            b = global_norm_clip

            self.assertTrue(
                np.isclose(a=a, b=b, rtol=1e-6, atol=1e-8),
                "gradient clip by global norm has wrong results, expected:%f, but received:%f"
                % (a, b))


class TestPureFP16ClipGradByGlobalNorm(unittest.TestCase):

    def check_main(self, expected_has_cast_op):
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        with paddle.static.program_guard(main_prog, startup_prog):
            names = ["p0", "p1"]
            shapes = [[2, 3], [4, 5]]

            param_and_grads = []
            main_block = main_prog.global_block()
            for name, shape in zip(names, shapes):
                p = main_block.create_parameter(name=name,
                                                shape=shape,
                                                dtype='float16')
                g = main_block.create_parameter(name=p.name + '@GRAD',
                                                shape=p.shape,
                                                dtype=p.dtype)
                param_and_grads.append((p, g))

            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            clip(param_and_grads)
            actual_has_cast = any(op.type == 'cast' for op in main_block.ops)
            self.assertEqual(actual_has_cast, expected_has_cast_op)

    def test_main(self):
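        # Pure fp16 global-norm clip is expected to insert 'cast' ops by
        # default; enabling _allow_pure_fp16_global_norm_clip keeps the whole
        # computation in fp16, so no 'cast' op should appear.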
        self.check_main(True)
        _allow_pure_fp16_global_norm_clip(True)
        self.check_main(False)
        _allow_pure_fp16_global_norm_clip(False)
        self.check_main(True)


if __name__ == '__main__':
    unittest.main()