# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.fluid.framework import _test_eager_guard


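# These tests cover dygraph "auto prune": during backward, branches that cannot
# produce a gradient (for example tensors marked stop_gradient=True, or ops with
# no gradient such as casts to int64) are skipped, so the parameters behind them
# keep a None gradient and are left untouched by optimizers.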
class AutoPruneLayer0(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear1 = paddle.nn.Linear(
            input_size,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )
        self.linear2 = paddle.nn.Linear(
            5,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )

    def forward(self, x, y):
        a = self.linear1(x)
        b = self.linear2(y)
        c = fluid.layers.mul(a, b)
        d = fluid.layers.reduce_mean(c)
        return d


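# Same structure as AutoPruneLayer0, but forward marks the linear2 branch as
# stop_gradient, so linear2 should be pruned from the backward pass.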
class AutoPruneLayer1(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear1 = paddle.nn.Linear(
            input_size,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )
        self.linear2 = paddle.nn.Linear(
            5,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )

    def forward(self, x, y):
        a = self.linear1(x)
        b = self.linear2(y)
        b.stop_gradient = True
        c = fluid.layers.mul(a, b)
        d = fluid.layers.reduce_mean(c)
        return d


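# The label branch is cast to int64 before cross_entropy; the cast has no
# gradient, so linear2 (which only feeds the label) should receive no gradient.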
class AutoPruneLayer2(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear = paddle.nn.Linear(input_size, 10)
        self.linear2 = paddle.nn.Linear(1, 1)

    def forward(self, x, label):
        feature = self.linear(x)
        label = self.linear2(label)
        label = fluid.layers.cast(label, dtype="float32")
        label = fluid.layers.cast(label, dtype='int64')
        # Note that the label is not persistable in fluid.layers.cross_entropy.
        loss = fluid.layers.cross_entropy(input=feature, label=label)
        loss = paddle.mean(loss)
        return loss


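# Splits the feature into part1/part2; only part1 feeds the loss, so the
# gradient that reaches part2 depends on which output backward starts from.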
class AutoPruneLayer3(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear = paddle.nn.Linear(input_size, 20)

    def forward(self, x, label, test_num):
        feature = self.linear(x)
        part1, part2 = fluid.layers.split(
            feature, num_or_sections=[10, 10], dim=1
        )
        # Note that part2 is not used by the loss.
        loss = fluid.layers.cross_entropy(input=part1, label=label)
        loss = paddle.mean(loss)
        if test_num == 1:
            return loss, part2
        else:
            return loss, part1, part2


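# MyLayer and MyLayer2 expose several forward paths that touch different
# parameter subsets (linears only, or embedding + linear), so the optimizer
# test below can check that untouched parameters are pruned from params_grads.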
class MyLayer(fluid.Layer):
    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        self.embed0 = fluid.Embedding(size=(vocab_size, size))
        self.embed1 = fluid.Embedding(size=(vocab_size, size))
        self.linear_0 = paddle.nn.Linear(input_size, size)
        self.linear_1 = paddle.nn.Linear(input_size, size)

    def forward(self, x):
        # This forward pass involves only the linear layers.
        loss = fluid.layers.reduce_mean(self.linear_0(x) + self.linear_1(x))
        return loss

    def linear0(self, x):
        loss = fluid.layers.reduce_mean(self.linear_0(x))
        return loss

    def embed_linear0(self, x):
        loss = fluid.layers.reduce_mean(self.linear_0(self.embed0(x)))
        return loss


class MyLayer2(fluid.Layer):
    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        self.embed0 = fluid.Embedding(size=(vocab_size, size))
        self.embed1 = fluid.Embedding(size=(vocab_size, size))
        self.linear_0 = paddle.nn.Linear(input_size, size)
        self.linear_1 = paddle.nn.Linear(input_size, size)

    def forward(self, indices):
        # Unlike MyLayer.forward, this forward pass involves all parameters:
        # both embeddings and both linear layers.
        loss = fluid.layers.reduce_mean(
            self.linear_0(self.embed0(indices))
            + self.linear_1(self.embed1(indices))
        )
        return loss

    def linear0(self, x):
        loss = fluid.layers.reduce_mean(self.linear_0(x))
        return loss

    def embed_linear0(self, x):
        loss = fluid.layers.reduce_mean(self.linear_0(self.embed0(x)))
        return loss


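# Each func_* case is run twice by its test_* wrapper: once under
# _test_eager_guard() (eager dygraph) and once in legacy dygraph mode.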
class TestImperativeAutoPrune(unittest.TestCase):
    def func_auto_prune(self):
        with fluid.dygraph.guard():
            case1 = AutoPruneLayer0(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case1(v1, v2)
            loss.backward()
            self.assertIsNotNone(case1.linear2.weight._grad_ivar())
            self.assertIsNotNone(case1.linear1.weight._grad_ivar())

    def test_auto_prune(self):
        with _test_eager_guard():
            self.func_auto_prune()
        self.func_auto_prune()

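    # AutoPruneLayer1 stops gradient on the linear2 branch, so only linear1
    # should end up with a gradient.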
    def func_auto_prune2(self):
        with fluid.dygraph.guard():
            case2 = AutoPruneLayer1(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case2(v1, v2)
            loss.backward()
            self.assertIsNone(case2.linear2.weight._grad_ivar())
            self.assertIsNotNone(case2.linear1.weight._grad_ivar())

    def test_auto_prune2(self):
        with _test_eager_guard():
            self.func_auto_prune2()
        self.func_auto_prune2()

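    # Backward starts from the loss (computed from part1 only); part2 still
    # gets a gradient buffer, but it should be all zeros. The retain-grad flag
    # is set so the intermediate part2 keeps its gradient for inspection.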
    # TODO(jiabin): Support this when we support better split tensor
    def func_auto_prune3(self):
        with fluid.dygraph.guard():
            case3 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part2 = case3(v1, v2, 1)
            loss.backward()
            self.assertIsNotNone(case3.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())

    def test_auto_prune3(self):
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        with _test_eager_guard():
            self.func_auto_prune3()
        self.func_auto_prune3()
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

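    # Backward starts from part2 itself, so part2's gradient should be all ones
    # and the shared linear layer still receives a gradient.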
    def func_auto_prune4(self):
        with fluid.dygraph.guard():
            case4 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part2 = case4(v1, v2, 1)
            part2.backward()
            self.assertIsNotNone(case4.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 1).all())

    def test_auto_prune4(self):
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        with _test_eager_guard():
            self.func_auto_prune4()
        self.func_auto_prune4()
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

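    # Backward starts from part1; part2 sits on a pruned branch, so its
    # gradient should stay all zeros.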
    def func_auto_prune5(self):
        with fluid.dygraph.guard():
            case4 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part1, part2 = case4(v1, v2, 2)
            part1.backward()
            self.assertIsNotNone(case4.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())

    def test_auto_prune5(self):
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        with _test_eager_guard():
            self.func_auto_prune5()
        self.func_auto_prune5()
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

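    # out1 is marked stop_gradient before the concat, so backward through the
    # concat should skip the first linear layer entirely.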
    def func_auto_prune6(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(3, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            out.backward()
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def test_auto_prune6(self):
        with _test_eager_guard():
            self.func_auto_prune6()
        self.func_auto_prune6()

    def func_auto_prune7(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(3, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            out.backward()
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def test_auto_prune7(self):
        with _test_eager_guard():
            self.func_auto_prune7()
        self.func_auto_prune7()

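    # linear2.weight is frozen after the forward pass; the SGD step should
    # leave linear2 unchanged while still updating linear.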
    def func_auto_prune8(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(5, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            linear2.weight.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)
            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            self.assertFalse(
                np.array_equal(linear_origin, linear.weight.numpy())
            )

    def test_auto_prune8(self):
        with _test_eager_guard():
            self.func_auto_prune8()
        self.func_auto_prune8()

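    # The loss itself is marked stop_gradient, so backward is a no-op: neither
    # layer is updated and linear2.weight has no gradient to fetch.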
    def func_auto_prune9(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(5, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            out2.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)
            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            np.testing.assert_array_equal(linear_origin, linear.weight.numpy())
            try:
                linear2.weight.gradient()
            except ValueError as e:
                assert type(e) == ValueError

    def test_auto_prune9(self):
        with _test_eager_guard():
            self.func_auto_prune9()
        self.func_auto_prune9()

    def func_auto_prune10(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(3, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            # TODO(jiabin): In eager mode sort_sum_gradient is not actually
            # needed; this test should be removed once fluid is no longer
            # supported.
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
            out.backward()
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def test_auto_prune10(self):
        with _test_eager_guard():
            self.func_auto_prune10()
        self.func_auto_prune10()

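    # Only parameters on the backward path of the chosen loss (embed0 and
    # linear_0) should appear in params_grads; embed1 and linear_1 must keep a
    # None gradient even though dummy_loss touched them in an earlier forward.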
    def func_auto_prune_with_optimizer(self):
        vocab_size = 100
        size = 20
        batch_size = 16

        indices = np.random.randint(
            low=0, high=100, size=(batch_size, 1)
        ).astype("int64")
        embed = np.random.randn(batch_size, size).astype("float32")

        place = fluid.CPUPlace()
        with fluid.dygraph.guard(place):
            model = MyLayer(size, vocab_size, size)
            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip
            )
            indices = fluid.dygraph.to_variable(indices)
            embed = fluid.dygraph.to_variable(embed)
            dummy_loss = model(embed)

            loss = model.embed_linear0(indices)
            loss.backward()
            _, params_grads = optimizer.minimize(loss)
            for items in params_grads:
                assert items[0].name != model.embed1.weight.name
                assert items[0].name != model.linear_1.weight.name
            assert model.embed1.weight._grad_ivar() is None
            assert model.linear_1.weight._grad_ivar() is None

        with fluid.dygraph.guard(place):
            model = MyLayer2(size, vocab_size, size)
            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip
            )

            indices = fluid.dygraph.to_variable(indices)
            embed = fluid.dygraph.to_variable(embed)
            dummy_loss = model(indices)

            loss = model.embed_linear0(indices)
            loss.backward()
            _, params_grads = optimizer.minimize(loss)
            for items in params_grads:
                assert items[0].name != model.embed1.weight.name
                assert items[0].name != model.linear_1.weight.name
            assert model.embed1.weight._grad_ivar() is None
            assert model.linear_1.weight._grad_ivar() is None

    def test_auto_prune_with_optimizer(self):
        with _test_eager_guard():
            self.func_auto_prune_with_optimizer()
        self.func_auto_prune_with_optimizer()

    def func_case2_prune_no_grad_branch(self):
        with fluid.dygraph.guard():
            value1 = np.arange(784).reshape(1, 784)
            value2 = np.arange(1).reshape(1, 1)
            v1 = fluid.dygraph.to_variable(value1).astype("float32")
            v2 = fluid.dygraph.to_variable(value2).astype("float32")
            case3 = AutoPruneLayer2(input_size=784)
            loss = case3(v1, v2)
            loss.backward()
            self.assertIsNone(case3.linear2.weight._grad_ivar())
            self.assertIsNotNone(case3.linear.weight._grad_ivar())

    def test_case2_prune_no_grad_branch(self):
        with _test_eager_guard():
            self.func_case2_prune_no_grad_branch()
        self.func_case2_prune_no_grad_branch()

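    # The int64 cast and one_hot carry no gradient, so nothing flows back to
    # the linear layer and its weight gradient stays None.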
    def func_case3_prune_no_grad_branch2(self):
        with fluid.dygraph.guard():
            value1 = np.arange(1).reshape(1, 1)
            linear = paddle.nn.Linear(1, 1)
            label = fluid.dygraph.to_variable(value1).astype("float32")
            label = linear(label)
            label = fluid.layers.cast(label, dtype="float32")
            label = fluid.layers.cast(label, dtype='int64')
            out = fluid.layers.one_hot(input=label, depth=100)
            loss = paddle.mean(out)
            loss.backward()
            self.assertIsNone(linear.weight._grad_ivar())

    def test_case3_prune_no_grad_branch2(self):
        with _test_eager_guard():
            self.func_case3_prune_no_grad_branch2()
        self.func_case3_prune_no_grad_branch2()

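    # gaussian_random has no grad op maker, so backward leaves its output
    # without a gradient.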
    def func_case4_with_no_grad_op_maker(self):
        with fluid.dygraph.guard():
            out = fluid.layers.gaussian_random(shape=[20, 30])
            loss = paddle.mean(out)
            loss.backward()
            self.assertIsNone(out._grad_ivar())

    def test_case4_with_no_grad_op_maker(self):
        with _test_eager_guard():
            self.func_case4_with_no_grad_op_maker()
        self.func_case4_with_no_grad_op_maker()


if __name__ == '__main__':
    unittest.main()