# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
from fake_reader import fake_imdb_reader
from paddle.fluid.clip import _allow_pure_fp16_global_norm_clip

paddle.enable_static()


def bow_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2):
    """
    BOW net
    This model is from https://github.com/PaddlePaddle/models:
    fluid/PaddleNLP/text_classification/nets.py
    """
    emb = fluid.layers.embedding(input=data,
                                 is_sparse=True,
                                 size=[dict_dim, emb_dim])
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = paddle.mean(x=cost)

    return avg_cost


class TestGradientClip(unittest.TestCase):

    def setUp(self):
        self.word_dict_len = 5147
        self.BATCH_SIZE = 2
        reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100)
        self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE)
        self.clip_gradient = lambda x: None
        self.init()

    def init(self):
        pass

    def get_places(self):
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))
        return places

    def check_clip_result(self, out, out_clip):
        pass

    def check_gradient_clip(self, place, dtype='float32'):
        prog = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program=prog,
                                 startup_program=startup_program):
            image = fluid.data(name="a", shape=[-1, 784], dtype='float32')
            label = fluid.data(name="b", shape=[-1, 1], dtype='int64')
            if dtype != 'float32':
                image_cast = paddle.cast(image, dtype)
                hidden = fluid.layers.fc(input=image_cast, size=32, act='relu')
            else:
                hidden = fluid.layers.fc(input=image, size=32, act='relu')
            predict = fluid.layers.fc(input=hidden, size=10, act='softmax')

            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = paddle.mean(cost)

        prog_clip = prog.clone()
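        # `prog` keeps the raw gradients; the clipping ops are appended only
        # to the cloned `prog_clip`, so the two runs can be compared below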
        avg_cost_clip = prog_clip.block(0).var(avg_cost.name)

        p_g = fluid.backward.append_backward(loss=avg_cost)
        p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)

        p_g = sorted(p_g, key=lambda x: x[0].name)
        p_g_clip = sorted(p_g_clip, key=lambda x: x[0].name)
        with fluid.program_guard(main_program=prog_clip,
                                 startup_program=startup_program):
            p_g_clip = self.clip_gradient(p_g_clip)

        grad_list = [elem[1] for elem in p_g]
        grad_clip_list = [elem[1] for elem in p_g_clip]

        train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=3)
        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
        exe.run(startup_program)

        data = next(train_reader())
        out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
        out_clip = exe.run(prog_clip,
                           feed=feeder.feed(data),
                           fetch_list=grad_clip_list)
        self.check_clip_result(out, out_clip)

    def check_sparse_gradient_clip(self, place):
        prog = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program=prog,
                                 startup_program=startup_program):
            data = fluid.data(name="words",
                              shape=[-1, 1],
                              dtype="int64",
                              lod_level=1)
            label = fluid.data(name="label", shape=[-1, 1], dtype="int64")
            cost = bow_net(data, label, self.word_dict_len)

            self.backward_and_optimize(cost)

        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
        exe.run(startup_program)

        data = next(self.train_data())
        val = exe.run(prog, feed=feeder.feed(data), fetch_list=[cost])[0]
        self.assertEqual((1, ), val.shape)
        self.assertFalse(np.isnan(val))

    def backward_and_optimize(self, cost):
        pass


class TestGradientClipByGlobalNorm(TestGradientClip):

    def init(self):
        self.clip_norm = 0.2

    def check_clip_result(self, out, out_clip):
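        # reference computation for ClipGradByGlobalNorm:
        # global_norm = sqrt(sum_i ||g_i||_2^2), and every gradient is
        # rescaled by clip_norm / max(clip_norm, global_norm)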
        global_norm = 0
        for v in out:
            global_norm += np.sum(np.square(v))
        global_norm = np.sqrt(global_norm)
        scale = self.clip_norm / np.maximum(self.clip_norm, global_norm)
        for i in range(len(out)):
            out[i] = scale * out[i]

        for u, v in zip(out, out_clip):
            np.testing.assert_allclose(
                u,
                v,
                rtol=1e-05,
                atol=1e-08,
                err_msg=
                'gradient clip by global norm has wrong results!\nu={}\nv={}\ndiff={}'
                .format(u, v, u - v))

    # test whether the output is right when using 'set_gradient_clip'
    def test_old_gradient_clip(self):

        def func(params_grads):
            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
            fluid.clip.set_gradient_clip(clip)
            return fluid.clip.append_gradient_clip_ops(params_grads)

        self.clip_gradient = func
        self.check_gradient_clip(fluid.CPUPlace())

    # test whether the output is right when using grad_clip
    def test_new_gradient_clip(self):

        def func(params_grads):
            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
            return clip(params_grads)

        self.clip_gradient = func
        self.check_gradient_clip(fluid.CPUPlace())

    # test whether the output is right when using grad_clip with float64
    def test_new_gradient_clip_fp64(self):

        def func(params_grads):
            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
            return clip(params_grads)

        self.clip_gradient = func
        self.check_gradient_clip(fluid.CPUPlace(), "float64")

    # invoke 'set_gradient_clip' in the wrong order
    def test_wrong_API_order(self):

        def backward_func(cost):
            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
            fluid.clip.set_gradient_clip(clip)
            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
                                                grad_clip=clip)
            # if 'set_gradient_clip' and 'optimize(grad_clip)' are used together, 'set_gradient_clip' will be ineffective
            sgd_optimizer.minimize(cost)
            # 'set_gradient_clip' must be called before 'minimize', otherwise it will be ineffective
            fluid.clip.set_gradient_clip(clip)

        self.backward_and_optimize = backward_func
        for place in self.get_places():
            self.check_sparse_gradient_clip(place)

    # raise TypeError
    def test_typeError(self):
        # the grad_clip argument of the optimizer must be an instance of a subclass of GradientClipBase
        with self.assertRaises(TypeError):
            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,
                                                grad_clip="test")

    # if grad is None or does not need to be clipped
    def test_none_grad_fp32(self):
        ops = self._test_none_grad_helper("float32")
        self.assertListEqual(ops, [
            'squared_l2_norm', 'squared_l2_norm', 'sum', 'sqrt',
            'fill_constant', 'elementwise_max', 'elementwise_div',
            'elementwise_mul', 'elementwise_mul'
        ])

    def test_none_grad_fp16(self):
        ops = self._test_none_grad_helper("float16")
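        # the fp16 path computes the per-parameter squared norms with
        # square + reduce_sum and inserts cast ops around the norm/scale
        # arithmetic, which is why this op list differs from the fp32 one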
        self.assertListEqual(ops, [
            'square', 'reduce_sum', 'square', 'reduce_sum', 'sum', 'cast',
            'sqrt', 'fill_constant', 'elementwise_max', 'elementwise_div',
            'cast', 'elementwise_mul', 'cast', 'elementwise_mul'
        ])

    def _test_none_grad_helper(self, dtype):
        prog = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program=prog,
                                 startup_program=startup_program):
            clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
            x = fluid.default_main_program().global_block().create_parameter(
                name="x", shape=[2, 3], dtype=dtype)
            y = fluid.default_main_program().global_block().create_parameter(
                name="y", shape=[2, 3], dtype=dtype)

            # (x, None) should not be returned
            params_grads = [(x, None), (x, y), (y, x)]
            params_grads = clip(params_grads)
            self.assertTrue(
                len(params_grads) == 2,
                "ClipByGlobalNorm: when grad is None, it shouldn't be returned by gradient clip!"
            )

            ops = [op.type for op in x.block.ops]
        return ops


class TestGradientClipByNorm(TestGradientClip):

    def init(self):
        self.clip_norm = 0.2

    def check_clip_result(self, out, out_clip):
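        # ClipGradByNorm works per tensor: each gradient is rescaled by
        # clip_norm / max(clip_norm, ||g||_2)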
        for u, v in zip(out, out_clip):
            norm = np.sqrt(np.sum(np.power(u, 2)))
            scale = self.clip_norm / np.maximum(self.clip_norm, norm)
            u = u * scale
            np.testing.assert_allclose(
                u,
                v,
                rtol=1e-05,
                atol=1e-08,
                err_msg='gradient clip by norm has wrong results!')

    # test whether the output is right when using grad_clip
    def test_gradient_clip(self):

        def func(params_grads):
            clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
            return clip(params_grads)

        self.clip_gradient = func
        self.check_gradient_clip(fluid.CPUPlace())

    # if grad is None or does not need to be clipped
    def test_none_grad(self):
        clip = fluid.clip.GradientClipByNorm(self.clip_norm)
        x = fluid.default_main_program().global_block().create_parameter(
            name="x", shape=[2, 3], dtype="float32", need_clip=False)
        y = fluid.default_main_program().global_block().create_parameter(
            name="y", shape=[2, 3], dtype="float32", need_clip=False)

        # (x, None) should not be returned
        params_grads = [(x, None), (x, y)]
        params_grads = clip(params_grads)
        self.assertTrue(
            len(params_grads) == 1,
            "ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!"
        )
        self.assertTrue(
            params_grads[0][1].name == 'y',
            "ClipGradByNorm: grad should not be clipped when filtered out!")


class TestGradientClipByValue(TestGradientClip):

    def init(self):
        self.max = 0.2
        self.min = 0.1

    def check_clip_result(self, out, out_clip):
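        # ClipGradByValue simply clips every gradient element into [min, max]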
        for i, v in enumerate(out):
            out[i] = np.clip(v, self.min, self.max)
        for u, v in zip(out, out_clip):
            u = np.clip(u, self.min, self.max)
            np.testing.assert_allclose(
                u,
                v,
                rtol=1e-06,
                atol=1e-08,
                err_msg='gradient clip by value has wrong results!')

    # test whether the output is right when using grad_clip
    def test_gradient_clip(self):

        def func(params_grads):
            clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
            return clip(params_grads)

        self.clip_gradient = func
        self.check_gradient_clip(fluid.CPUPlace())

    # if grad is None or does not need to be clipped
    def test_none_grad(self):
        clip = fluid.clip.GradientClipByValue(self.max, self.min)
        x = fluid.default_main_program().global_block().create_parameter(
            name="x", shape=[2, 3], dtype="float32", need_clip=False)
        y = fluid.default_main_program().global_block().create_parameter(
            name="y", shape=[2, 3], dtype="float32", need_clip=False)

        # (x, None) should not be returned
        params_grads = [(x, None), (x, y)]
        params_grads = clip(params_grads)
        self.assertTrue(
            len(params_grads) == 1,
            "ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!"
        )
        self.assertTrue(
            params_grads[0][1].name == 'y',
            "ClipGradByValue: grad should not be clipped when filtered out!")


class TestDygraphGradientClip(unittest.TestCase):

    def test_gradient_clip(self):
        with fluid.dygraph.guard():
            linear = fluid.dygraph.Linear(5, 5)
            inputs = fluid.layers.uniform_random([16, 5], min=-10,
                                                 max=10).astype('float32')
            out = linear(fluid.dygraph.to_variable(inputs))
            loss = fluid.layers.reduce_mean(out)
            loss.backward()
            sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=0.0,
                parameter_list=linear.parameters(),
                grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1))
            self.check_clip_result(loss, sgd_optimizer)

    def check_clip_result(self, loss, optimizer):
        pass


class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):

    def setUp(self):
        self.clip_norm = 0.8
        self.clip1 = fluid.clip.GradientClipByGlobalNorm(
            clip_norm=self.clip_norm)
        self.clip2 = fluid.clip.GradientClipByGlobalNorm(
            clip_norm=self.clip_norm)

    def check_clip_result(self, loss, optimizer):
        # if grad is None
        x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"),
                                      name="x")
        y = fluid.dygraph.to_variable(np.array([3, 4]).astype("float32"),
                                      name="y")
        assert len(self.clip1([(x, x), (x, y), (x, None)])) == 2
        # get params and grads from network
        opt, params_grads = optimizer.minimize(loss)
        _, grads = zip(*params_grads)
        params_grads = self.clip2(params_grads)
        _, grads_clip = zip(*params_grads)

        global_norm = 0
        for u in grads:
            u = u.numpy()
            global_norm += np.sum(np.power(u, 2))
        global_norm = np.sqrt(global_norm)

        global_norm_clip = 0
        for v in grads_clip:
            v = v.numpy()
            global_norm_clip += np.sum(np.power(v, 2))
        global_norm_clip = np.sqrt(global_norm_clip)
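        # after clipping, the global norm should equal
        # min(original global norm, clip_norm)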

        a = np.minimum(global_norm, self.clip_norm)
        b = global_norm_clip
        self.assertTrue(
            np.isclose(a=a, b=b, rtol=1e-6, atol=1e-8),
            "gradient clip by global norm has wrong results, expected:%f, but received:%f"
            % (a, b))


class TestDygraphGradientClipByNorm(TestDygraphGradientClip):

    def setUp(self):
        self.clip_norm = 0.8
        self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)

    def check_clip_result(self, loss, optimizer):
        # if grad is None
        x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"))
        assert len(self.clip([(x, None)])) == 0
        # get params and grads from network
        self.clip([(fluid.dygraph.to_variable(np.array([2, 3])), None)])
        opt, params_grads = optimizer.minimize(loss)
        _, grads = zip(*params_grads)
        params_grads = self.clip(params_grads)
        _, grads_clip = zip(*params_grads)

        for u, v in zip(grads, grads_clip):
            u = u.numpy()
            v = v.numpy()
            a = np.sqrt(np.sum(np.power(u, 2)))
            a = np.minimum(a, self.clip_norm)
            b = np.sqrt(np.sum(np.power(v, 2)))
            self.assertTrue(
                np.isclose(a=a, b=b, rtol=1e-6, atol=1e-8),
                "gradient clip by norm has wrong results, expected:%f, but received:%f"
                % (a, b))


class TestDygraphGradientClipByValue(TestDygraphGradientClip):

    def setUp(self):
        self.max = 0.2
        self.min = 0.1
        self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)

    def check_clip_result(self, loss, optimizer):
        # if grad is None
        x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"))
        assert len(self.clip([(x, None)])) == 0
        # get params and grads from network
        opt, params_grads = optimizer.minimize(loss)
        _, grads = zip(*params_grads)
        params_grads = self.clip(params_grads)
        _, grads_clip = zip(*params_grads)
        for u, v in zip(grads, grads_clip):
            u = np.clip(u.numpy(), self.min, self.max)
            v = v.numpy()
            np.testing.assert_allclose(
                u,
                v,
                rtol=1e-06,
                atol=1e-08,
                err_msg='gradient clip by value has wrong results!')


class SimpleNet(paddle.nn.Layer):

    def __init__(self):
        super(SimpleNet, self).__init__()
        self.linear = paddle.nn.Linear(5, 5)
        self.batch_norm = paddle.nn.BatchNorm(5)

    def forward(self, x):
        x = self.linear(x)
        x = self.batch_norm(x)
        return x


class TestDygraphGradientClipFP16(unittest.TestCase):

    def test_gradient_clip(self):
        if fluid.core.is_compiled_with_cuda():
            with fluid.dygraph.guard():
                paddle.seed(10)
                model = SimpleNet()
                sgd_optimizer = paddle.optimizer.SGD(
                    learning_rate=0.0, parameters=model.parameters())
                model, sgd_optimizer = paddle.amp.decorate(
                    models=model, optimizers=sgd_optimizer, level='O2')
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                inputs = fluid.layers.uniform_random([1, 5], min=-10,
                                                     max=10).astype('float32')
                with paddle.amp.auto_cast(level='O2'):
                    out = model(fluid.dygraph.to_variable(inputs))
                    loss = fluid.layers.reduce_mean(out)
                scaled = scaler.scale(loss)
                scaled.backward()
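                # unscale the gradients before clipping so the threshold is
                # applied to the true gradient magnitudes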
                scaler.unscale_(sgd_optimizer)
                # before clip
                params_grads = []
                for param in model.parameters():
                    if param.stop_gradient:
                        continue
                    if param._grad_ivar() is not None:
                        params_grads.append((param, param._grad_ivar()))
                _, grads = zip(*params_grads)
                # clip grads
                clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.8)
                params_grads = clip(params_grads)
                _, grads_clip = zip(*params_grads)
                # param update
                scaler.step(sgd_optimizer)
                scaler.update()

                global_norm = 0
                for u in grads:
                    u = u.numpy()
                    global_norm += np.sum(np.power(u, 2))
                global_norm = np.sqrt(global_norm)
                global_norm_clip = 0
                for v in grads_clip:
                    v = v.numpy()
                    global_norm_clip += np.sum(np.power(v, 2))
                global_norm_clip = np.sqrt(global_norm_clip)

                a = np.minimum(global_norm, 0.8)
                b = global_norm_clip
                self.assertTrue(
                    np.isclose(a=a, b=b, rtol=1e-3, atol=1e-8),
                    "gradient clip by global norm has wrong results, expected:%f, but received:%f"
                    % (a, b))


class TestDygraphGradientClipFP64(unittest.TestCase):

    def test_gradient_clip(self):
        with fluid.dygraph.guard():
            inputs = fluid.layers.uniform_random([16, 5], min=-10,
                                                 max=10).astype('float64')
            linear = fluid.dygraph.Linear(5, 5, dtype="float64")
            out = linear(fluid.dygraph.to_variable(inputs))
            loss = fluid.layers.reduce_mean(out)
            loss.backward()
            # before clip
            params_grads = []
            for param in linear.parameters():
                if param.stop_gradient:
                    continue
                if param._grad_ivar() is not None:
                    params_grads.append((param, param._grad_ivar()))
            _, grads = zip(*params_grads)
            # clip grads
            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1)
            params_grads = clip(params_grads)
            _, grads_clip = zip(*params_grads)

            global_norm = 0
            for u in grads:
                u = u.numpy()
                global_norm += np.sum(np.power(u, 2))
            global_norm = np.sqrt(global_norm)

            global_norm_clip = 0
            for v in grads_clip:
                v = v.numpy()
                global_norm_clip += np.sum(np.power(v, 2))
            global_norm_clip = np.sqrt(global_norm_clip)

            a = np.minimum(global_norm, 0.1)
            b = global_norm_clip

            self.assertTrue(
                np.isclose(a=a, b=b, rtol=1e-6, atol=1e-8),
                "gradient clip by global norm has wrong results, expected:%f, but received:%f"
                % (a, b))


class TestPureFP16ClipGradByGlobalNorm(unittest.TestCase):

    def check_main(self, expected_has_cast_op):
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        with paddle.static.program_guard(main_prog, startup_prog):
            names = ["p0", "p1"]
            shapes = [[2, 3], [4, 5]]

            param_and_grads = []
            main_block = main_prog.global_block()
            for name, shape in zip(names, shapes):
                p = main_block.create_parameter(name=name,
                                                shape=shape,
                                                dtype='float16')
                g = main_block.create_parameter(name=p.name + '@GRAD',
                                                shape=p.shape,
                                                dtype=p.dtype)
                param_and_grads.append((p, g))

            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            clip(param_and_grads)
            actual_has_cast = any(op.type == 'cast' for op in main_block.ops)
            self.assertEqual(actual_has_cast, expected_has_cast_op)

    def test_main(self):
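        # by default the global-norm clip inserts cast ops for pure fp16
        # parameters; _allow_pure_fp16_global_norm_clip(True) removes them so
        # the whole clip stays in fp16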
        self.check_main(True)
        _allow_pure_fp16_global_norm_clip(True)
        self.check_main(False)
        _allow_pure_fp16_global_norm_clip(False)
        self.check_main(True)


if __name__ == '__main__':
    unittest.main()