# test_imperative_auto_prune.py
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
16 17 18

import numpy as np

19
import paddle
20
import paddle.fluid as fluid
21
from paddle.fluid.framework import _test_eager_guard
22
from paddle.nn import Embedding
23
from paddle.tensor import random
24 25 26


class AutoPruneLayer0(fluid.Layer):
    """Two bias-free linear branches combined with ``fluid.layers.mul``.

    Both weights are initialised to the constant 2 and neither branch is
    excluded from gradient computation.
    """

    def __init__(self, input_size):
        """Build ``linear1`` (input_size -> 5) and ``linear2`` (5 -> 5)."""
        super().__init__()

        def constant_weight():
            # One fresh ParamAttr per layer, so the layers never share an attr.
            return paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            )

        self.linear1 = paddle.nn.Linear(
            input_size, 5, weight_attr=constant_weight(), bias_attr=False
        )
        self.linear2 = paddle.nn.Linear(
            5, 5, weight_attr=constant_weight(), bias_attr=False
        )

    def forward(self, x, y):
        """Return the mean of ``mul(linear1(x), linear2(y))``."""
        lhs = self.linear1(x)
        rhs = self.linear2(y)
        return paddle.mean(fluid.layers.mul(lhs, rhs))


class AutoPruneLayer1(fluid.Layer):
    """Same graph as a two-branch product, but the second branch is frozen.

    ``forward`` marks the output of ``linear2`` with ``stop_gradient = True``
    before it is multiplied with the output of ``linear1``.
    """

    def __init__(self, input_size):
        """Build ``linear1`` (input_size -> 5) and ``linear2`` (5 -> 5), no bias."""
        super().__init__()

        def make_linear(in_features):
            # Every call builds its own ParamAttr with a constant-2 initializer.
            attr = paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            )
            return paddle.nn.Linear(
                in_features, 5, weight_attr=attr, bias_attr=False
            )

        self.linear1 = make_linear(input_size)
        self.linear2 = make_linear(5)

    def forward(self, x, y):
        """Return the mean of ``mul(linear1(x), linear2(y))`` with branch 2 frozen."""
        live_branch = self.linear1(x)
        frozen_branch = self.linear2(y)
        # Cut the second branch out of the backward pass.
        frozen_branch.stop_gradient = True
        product = fluid.layers.mul(live_branch, frozen_branch)
        return paddle.mean(product)


class AutoPruneLayer2(fluid.Layer):
    """Cross-entropy model whose label is itself produced by a linear layer.

    The label path goes ``linear2`` -> cast to float32 -> cast to int64 before
    it reaches ``cross_entropy``.
    """

    def __init__(self, input_size):
        """Build ``linear`` (input_size -> 10) and ``linear2`` (1 -> 1)."""
        super().__init__()
        self.linear = paddle.nn.Linear(input_size, 10)
        self.linear2 = paddle.nn.Linear(1, 1)

    def forward(self, x, label):
        """Return the mean un-reduced cross entropy of ``linear(x)`` vs. the label."""
        feature = self.linear(x)

        # Label branch: linear projection followed by the two casts.
        projected = self.linear2(label)
        as_float = fluid.layers.cast(projected, dtype="float32")
        as_int = fluid.layers.cast(as_float, dtype='int64')

        # Note that the label is not persistable in
        # paddle.nn.functional.cross_entropy.
        per_sample = paddle.nn.functional.cross_entropy(
            input=feature, label=as_int, reduction='none', use_softmax=False
        )
        return paddle.mean(per_sample)


class AutoPruneLayer3(fluid.Layer):
    """Linear layer whose 20 outputs are split into two halves of 10.

    Only the first half feeds the loss; the second half is returned untouched.
    """

    def __init__(self, input_size):
        """Build ``linear`` (input_size -> 20)."""
        super().__init__()
        self.linear = paddle.nn.Linear(input_size, 20)

    def forward(self, x, label, test_num):
        """Return ``(loss, part2)`` when ``test_num == 1``, else ``(loss, part1, part2)``."""
        halves = fluid.layers.split(
            self.linear(x), num_or_sections=[10, 10], dim=1
        )
        part1, part2 = halves

        # Note that: part2 is not used by the loss.
        per_sample = paddle.nn.functional.cross_entropy(
            input=part1, label=label, reduction='none', use_softmax=False
        )
        loss = paddle.mean(per_sample)

        if test_num == 1:
            return loss, part2
        return loss, part1, part2


class MyLayer(fluid.Layer):
    """Two embeddings and two linear layers; ``forward`` uses only the linears."""

    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        """Create ``embed0``/``embed1`` and ``linear_0``/``linear_1`` in that order."""
        super().__init__(dtype=dtype)
        self.embed0 = Embedding(vocab_size, size)
        self.embed1 = Embedding(vocab_size, size)
        self.linear_0 = paddle.nn.Linear(input_size, size)
        self.linear_1 = paddle.nn.Linear(input_size, size)

    def forward(self, x):
        """Return the mean of ``linear_0(x) + linear_1(x)``."""
        # This method involves only the linear layers, never the embeddings.
        first = self.linear_0(x)
        second = self.linear_1(x)
        return paddle.mean(first + second)

    def linear0(self, x):
        """Return the mean of ``linear_0(x)``."""
        return paddle.mean(self.linear_0(x))

    def embed_linear0(self, x):
        """Return the mean of ``linear_0(embed0(x))``."""
        embedded = self.embed0(x)
        return paddle.mean(self.linear_0(embedded))


class MyLayer2(fluid.Layer):
    """Two embeddings and two linear layers; ``forward`` touches all of them."""

    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        """Create ``embed0``/``embed1`` and ``linear_0``/``linear_1`` in that order."""
        super().__init__(dtype=dtype)
        self.embed0 = Embedding(vocab_size, size)
        self.embed1 = Embedding(vocab_size, size)
        self.linear_0 = paddle.nn.Linear(input_size, size)
        self.linear_1 = paddle.nn.Linear(input_size, size)

    def forward(self, indices):
        """Return the mean of ``linear_0(embed0(i)) + linear_1(embed1(i))``."""
        # Mind the difference with MyLayer: here the forward method involves
        # all parameters, embeddings included.
        branch0 = self.linear_0(self.embed0(indices))
        branch1 = self.linear_1(self.embed1(indices))
        return paddle.mean(branch0 + branch1)

    def linear0(self, x):
        """Return the mean of ``linear_0(x)``."""
        return paddle.mean(self.linear_0(x))

    def embed_linear0(self, x):
        """Return the mean of ``linear_0(embed0(x))``."""
        embedded = self.embed0(x)
        return paddle.mean(self.linear_0(embedded))


class TestImperativeAutoPrune(unittest.TestCase):
    """Checks which tensors receive gradients after ``backward()`` in dygraph.

    Each ``func_*`` method holds one scenario. The matching ``test_*`` method
    runs it once inside ``_test_eager_guard()`` and once more outside of it.
    """

    # ------------------------------------------------------------------
    # Private helpers (names do not start with "test", so unittest skips them)
    # ------------------------------------------------------------------

    def _run_in_both_modes(self, scenario):
        """Call ``scenario`` inside ``_test_eager_guard()`` and then outside it."""
        with _test_eager_guard():
            scenario()
        scenario()

    def _run_with_retained_grads(self, scenario):
        """Run ``scenario`` in both modes with FLAGS_retain_grad_for_all_tensor on.

        The flag is switched back off in ``finally`` so that a failing scenario
        cannot leak the setting into the tests that run afterwards.
        """
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        try:
            self._run_in_both_modes(scenario)
        finally:
            fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

    def _check_concat_prune(self):
        """Concat a stop-gradient branch with two live inputs and run backward.

        Asserts that neither the frozen branch output nor the weight that
        produced it ends up with a gradient.
        """
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(3, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            out.backward()
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def _assert_unused_params_pruned(self, model, params_grads):
        """Assert ``embed1`` and ``linear_1`` weights got no gradient.

        Args:
            model: layer exposing ``embed1`` and ``linear_1`` sub-layers.
            params_grads: second element of what ``optimizer.minimize``
                returned; each entry is indexed with ``[0]`` for the parameter.
        """
        pruned_names = (
            model.embed1.weight.name,
            model.linear_1.weight.name,
        )
        for entry in params_grads:
            # Compare names by value; an identity check on two str objects
            # would pass even for equal names.
            self.assertNotIn(entry[0].name, pruned_names)
        self.assertIsNone(model.embed1.weight._grad_ivar())
        self.assertIsNone(model.linear_1.weight._grad_ivar())

    # ------------------------------------------------------------------
    # Scenarios and their test entry points
    # ------------------------------------------------------------------

    def func_auto_prune(self):
        """Both branches of AutoPruneLayer0 are live: both weights get a gradient."""
        with fluid.dygraph.guard():
            case1 = AutoPruneLayer0(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case1(v1, v2)
            loss.backward()
            self.assertIsNotNone(case1.linear2.weight._grad_ivar())
            self.assertIsNotNone(case1.linear1.weight._grad_ivar())

    def test_auto_prune(self):
        self._run_in_both_modes(self.func_auto_prune)

    def func_auto_prune2(self):
        """AutoPruneLayer1 freezes branch 2: only ``linear1`` gets a gradient."""
        with fluid.dygraph.guard():
            case2 = AutoPruneLayer1(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case2(v1, v2)

            loss.backward()
            self.assertIsNone(case2.linear2.weight._grad_ivar())
            self.assertIsNotNone(case2.linear1.weight._grad_ivar())

    def test_auto_prune2(self):
        self._run_in_both_modes(self.func_auto_prune2)

    # TODO(jiabin): Support this when we support better split tensor
    def func_auto_prune3(self):
        """Backward from the loss: the unused split half has an all-zero gradient."""
        with fluid.dygraph.guard():
            case3 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part2 = case3(v1, v2, 1)
            loss.backward()
            self.assertIsNotNone(case3.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())

    def test_auto_prune3(self):
        self._run_with_retained_grads(self.func_auto_prune3)

    def func_auto_prune4(self):
        """Backward from ``part2`` itself: its gradient is all ones."""
        with fluid.dygraph.guard():
            case4 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part2 = case4(v1, v2, 1)
            part2.backward()
            self.assertIsNotNone(case4.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 1).all())

    def test_auto_prune4(self):
        self._run_with_retained_grads(self.func_auto_prune4)

    def func_auto_prune5(self):
        """Backward from ``part1``: the sibling half ``part2`` has a zero gradient."""
        with fluid.dygraph.guard():
            case4 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part1, part2 = case4(v1, v2, 2)
            part1.backward()
            self.assertIsNotNone(case4.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())

    def test_auto_prune5(self):
        self._run_with_retained_grads(self.func_auto_prune5)

    def func_auto_prune6(self):
        """A stop-gradient input of ``concat`` is pruned from backward."""
        self._check_concat_prune()

    def test_auto_prune6(self):
        self._run_in_both_modes(self.func_auto_prune6)

    def func_auto_prune7(self):
        """Same scenario as ``func_auto_prune6``, kept as its own entry point."""
        self._check_concat_prune()

    def test_auto_prune7(self):
        self._run_in_both_modes(self.func_auto_prune7)

    def func_auto_prune8(self):
        """A weight marked stop-gradient is left untouched by the optimizer step."""
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(5, 3)
            a = fluid.dygraph.to_variable(value0)
            out1 = linear(a)
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            linear2.weight.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)
            # The frozen weight must be unchanged, the live one must have moved.
            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            self.assertFalse(
                np.array_equal(linear_origin, linear.weight.numpy())
            )

    def test_auto_prune8(self):
        self._run_in_both_modes(self.func_auto_prune8)

    def func_auto_prune9(self):
        """Backward from a stop-gradient output updates neither weight."""
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(5, 3)
            a = fluid.dygraph.to_variable(value0)
            out1 = linear(a)
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            out2.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)
            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            np.testing.assert_array_equal(linear_origin, linear.weight.numpy())
            try:
                linear2.weight.gradient()
            except ValueError:
                # Reading the gradient may raise ValueError here; the scenario
                # tolerates both outcomes, so the error is deliberately ignored.
                pass

    def test_auto_prune9(self):
        self._run_in_both_modes(self.func_auto_prune9)

    def func_auto_prune10(self):
        """Concat pruning scenario with FLAGS_sort_sum_gradient switched on."""
        # TODO(jiabin): In Eager Mode we don't actually need sort_sum_gradient,
        # this test should be removed when we don't support fluid anymore.
        previous = fluid.get_flags(['FLAGS_sort_sum_gradient'])
        fluid.set_flags({'FLAGS_sort_sum_gradient': True})
        try:
            self._check_concat_prune()
        finally:
            # Put the flag back so later tests do not inherit it.
            fluid.set_flags(previous)

    def test_auto_prune10(self):
        self._run_in_both_modes(self.func_auto_prune10)

    def func_auto_prune_with_optimizer(self):
        """Parameters outside the backward path are absent from ``params_grads``.

        Runs the same check for ``MyLayer`` (forward uses only the linears) and
        ``MyLayer2`` (forward uses every parameter). In both cases backward
        is driven from ``embed_linear0`` only.
        """
        vocab_size = 100
        size = 20
        batch_size = 16

        indices_np = np.random.randint(
            low=0, high=100, size=(batch_size, 1)
        ).astype("int64")
        embed_np = np.random.randn(batch_size, size).astype("float32")

        place = fluid.CPUPlace()
        with fluid.dygraph.guard(place):
            model = MyLayer(size, vocab_size, size)
            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip
            )
            indices = fluid.dygraph.to_variable(indices_np)
            embed = fluid.dygraph.to_variable(embed_np)
            # Forward pass whose result is never back-propagated.
            dummy_loss = model(embed)

            loss = model.embed_linear0(indices)
            loss.backward()
            _, params_grads = optimizer.minimize(loss)
            self._assert_unused_params_pruned(model, params_grads)

        with fluid.dygraph.guard(place):
            model = MyLayer2(size, vocab_size, size)
            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip
            )

            indices = fluid.dygraph.to_variable(indices_np)
            # Forward pass whose result is never back-propagated.
            dummy_loss = model(indices)

            loss = model.embed_linear0(indices)
            loss.backward()
            # Capture this model's own params_grads rather than re-checking
            # the list produced for the first model.
            _, params_grads = optimizer.minimize(loss)
            self._assert_unused_params_pruned(model, params_grads)

    def test_auto_prune_with_optimizer(self):
        self._run_in_both_modes(self.func_auto_prune_with_optimizer)

    def func_case2_prune_no_grad_branch(self):
        """The int-cast label branch of AutoPruneLayer2 receives no gradient."""
        with fluid.dygraph.guard():
            value1 = np.arange(784).reshape(1, 784)
            value2 = np.arange(1).reshape(1, 1)
            v1 = fluid.dygraph.to_variable(value1).astype("float32")
            v2 = fluid.dygraph.to_variable(value2).astype("float32")
            case3 = AutoPruneLayer2(input_size=784)
            loss = case3(v1, v2)
            loss.backward()
            self.assertIsNone(case3.linear2.weight._grad_ivar())
            self.assertIsNotNone(case3.linear.weight._grad_ivar())

    def test_case2_prune_no_grad_branch(self):
        self._run_in_both_modes(self.func_case2_prune_no_grad_branch)

    def func_case3_prune_no_grad_branch2(self):
        """A linear layer feeding cast -> one_hot -> mean gets no gradient."""
        with fluid.dygraph.guard():
            value1 = np.arange(1).reshape(1, 1)
            linear = paddle.nn.Linear(1, 1)
            label = fluid.dygraph.to_variable(value1).astype("float32")
            label = linear(label)
            label = fluid.layers.cast(label, dtype="float32")
            label = fluid.layers.cast(label, dtype='int64')
            out = fluid.layers.one_hot(input=label, depth=100)
            loss = paddle.mean(out)
            loss.backward()
            self.assertIsNone(linear.weight._grad_ivar())

    def test_case3_prune_no_grad_branch2(self):
        self._run_in_both_modes(self.func_case3_prune_no_grad_branch2)

    def func_case4_with_no_grad_op_maker(self):
        """The output of ``random.gaussian`` has no gradient after backward."""
        with fluid.dygraph.guard():
            out = random.gaussian(shape=[20, 30])
            loss = paddle.mean(out)
            loss.backward()
            self.assertIsNone(out._grad_ivar())

    def test_case4_with_no_grad_op_maker(self):
        self._run_in_both_modes(self.func_case4_with_no_grad_op_maker)

506 507 508

# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()