test_imperative_auto_prune.py 18.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
16 17 18

import numpy as np

19
import paddle
20
import paddle.fluid as fluid
21
from paddle.fluid.framework import _test_eager_guard
22 23 24


class AutoPruneLayer0(fluid.Layer):
    """Two bias-free Linear layers whose outputs are combined and averaged.

    Both layers emit 5 features and initialise every weight to the
    constant 2. ``forward`` does not stop the gradient on either branch.
    """

    def __init__(self, input_size):
        super().__init__()

        def constant_linear(in_features):
            # Bias-free projection to 5 features with all weights set to 2.
            return fluid.dygraph.Linear(
                in_features,
                5,
                param_attr=fluid.initializer.ConstantInitializer(value=2),
                bias_attr=False,
            )

        self.linear1 = constant_linear(input_size)
        self.linear2 = constant_linear(5)

    def forward(self, x, y):
        """Return the mean of ``mul(linear1(x), linear2(y))``."""
        lhs = self.linear1(x)
        rhs = self.linear2(y)
        product = fluid.layers.mul(lhs, rhs)
        return fluid.layers.reduce_mean(product)


class AutoPruneLayer1(fluid.Layer):
    """Same topology as two stacked constant Linear layers, one branch frozen.

    Both layers emit 5 features and initialise every weight to the
    constant 2. ``forward`` marks the output of ``linear2`` with
    ``stop_gradient = True`` before it is combined with the other branch.
    """

    def __init__(self, input_size):
        super().__init__()

        def constant_linear(in_features):
            # Bias-free projection to 5 features with all weights set to 2.
            return fluid.dygraph.Linear(
                in_features,
                5,
                param_attr=fluid.initializer.ConstantInitializer(value=2),
                bias_attr=False,
            )

        self.linear1 = constant_linear(input_size)
        self.linear2 = constant_linear(5)

    def forward(self, x, y):
        """Return the mean of ``mul(linear1(x), linear2(y))``."""
        live = self.linear1(x)
        frozen = self.linear2(y)
        # Cut the backward path through linear2 before it is used.
        frozen.stop_gradient = True
        product = fluid.layers.mul(live, frozen)
        return fluid.layers.reduce_mean(product)


class AutoPruneLayer2(fluid.Layer):
    """Cross-entropy model whose label passes through its own Linear layer.

    ``linear`` maps the input to 10 features; ``linear2`` is a 1 -> 1
    Linear applied to the label, whose output is then cast to float32 and
    to int64 before being used as the cross-entropy label.
    """

    def __init__(self, input_size):
        super().__init__()
        self.linear = fluid.dygraph.Linear(input_size, 10, act=None)
        self.linear2 = fluid.dygraph.Linear(1, 1, act=None)

    def forward(self, x, label):
        """Return the mean cross-entropy of ``linear(x)`` against the label."""
        feature = self.linear(x)
        target = self.linear2(label)
        # Cast to float32 first, then to int64, in that order.
        for dtype in ("float32", "int64"):
            target = fluid.layers.cast(target, dtype=dtype)
        # Note that the label is not persistable in fluid.layers.cross_entropy.
        per_sample = fluid.layers.cross_entropy(input=feature, label=target)
        return paddle.mean(per_sample)


class AutoPruneLayer3(fluid.Layer):
    """Linear layer whose 20 outputs are split into two halves of 10.

    Only the first half feeds the cross-entropy loss; the second half is
    returned to the caller but never used in the loss.
    """

    def __init__(self, input_size):
        super().__init__()
        self.linear = fluid.dygraph.Linear(input_size, 20, act=None)

    def forward(self, x, label, test_num):
        """Return ``(loss, part2)`` if ``test_num == 1``, else all three.

        When ``test_num`` is anything other than 1 the result is
        ``(loss, part1, part2)``.
        """
        halves = fluid.layers.split(
            self.linear(x), num_or_sections=[10, 10], dim=1
        )
        part1, part2 = halves
        # Note that: part2 is not used.
        loss = paddle.mean(
            fluid.layers.cross_entropy(input=part1, label=label)
        )
        if test_num != 1:
            return loss, part1, part2
        return loss, part2


class MyLayer(fluid.Layer):
    """Layer with two embeddings and two Linear layers.

    ``forward`` only runs the two Linear layers; the embeddings are reached
    solely through ``embed_linear0``.
    """

    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        table_shape = (vocab_size, size)
        self.embed0 = fluid.Embedding(size=table_shape)
        self.embed1 = fluid.Embedding(size=table_shape)
        self.linear_0 = fluid.Linear(input_size, size, dtype=dtype)
        self.linear_1 = fluid.Linear(input_size, size, dtype=dtype)

    def forward(self, x):
        """Return the mean of ``linear_0(x) + linear_1(x)``."""
        # this method involves only the linear layers
        summed = self.linear_0(x) + self.linear_1(x)
        return fluid.layers.reduce_mean(summed)

    def linear0(self, x):
        """Return the mean of ``linear_0(x)``."""
        return fluid.layers.reduce_mean(self.linear_0(x))

    def embed_linear0(self, x):
        """Return the mean of ``linear_0(embed0(x))``."""
        embedded = self.embed0(x)
        return fluid.layers.reduce_mean(self.linear_0(embedded))


class MyLayer2(fluid.Layer):
    """Layer with two embeddings and two Linear layers, all used by forward.

    Unlike ``MyLayer``, ``forward`` here runs both embeddings and both
    Linear layers.
    """

    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        table_shape = (vocab_size, size)
        self.embed0 = fluid.Embedding(size=table_shape)
        self.embed1 = fluid.Embedding(size=table_shape)
        self.linear_0 = fluid.Linear(input_size, size, dtype=dtype)
        self.linear_1 = fluid.Linear(input_size, size, dtype=dtype)

    def forward(self, indices):
        """Return the mean of the two embed-then-linear branches summed."""
        # mind the difference with MyLayer
        # In this example, the forward method involves all params
        first = self.linear_0(self.embed0(indices))
        second = self.linear_1(self.embed1(indices))
        return fluid.layers.reduce_mean(first + second)

    def linear0(self, x):
        """Return the mean of ``linear_0(x)``."""
        return fluid.layers.reduce_mean(self.linear_0(x))

    def embed_linear0(self, x):
        """Return the mean of ``linear_0(embed0(x))``."""
        embedded = self.embed0(x)
        return fluid.layers.reduce_mean(self.linear_0(embedded))


class TestImperativeAutoPrune(unittest.TestCase):
158
    def func_auto_prune(self):
        """Check that AutoPruneLayer0 yields gradients for both weights.

        Feeds two identical 5x5 float32 inputs, back-propagates the loss
        and asserts that neither Linear weight has a ``None`` gradient.
        """
        with fluid.dygraph.guard():
            net = AutoPruneLayer0(input_size=5)
            inputs = [
                fluid.dygraph.to_variable(
                    np.arange(25).reshape(5, 5).astype("float32")
                )
                for _ in range(2)
            ]
            loss = net(*inputs)
            loss.backward()
            self.assertIsNotNone(net.linear2.weight._grad_ivar())
            self.assertIsNotNone(net.linear1.weight._grad_ivar())
169

170 171 172 173 174 175
    def test_auto_prune(self):
        """Run ``func_auto_prune`` inside ``_test_eager_guard``, then outside."""
        scenario = self.func_auto_prune
        with _test_eager_guard():
            scenario()
        scenario()

    def func_auto_prune2(self):
        """Check that AutoPruneLayer1 prunes the stop-gradient branch.

        After backward, ``linear2.weight`` must have no gradient while
        ``linear1.weight`` must have one.
        """
        with fluid.dygraph.guard():
            net = AutoPruneLayer1(input_size=5)
            inputs = [
                fluid.dygraph.to_variable(
                    np.arange(25).reshape(5, 5).astype("float32")
                )
                for _ in range(2)
            ]
            loss = net(*inputs)
            loss.backward()
            self.assertIsNone(net.linear2.weight._grad_ivar())
            self.assertIsNotNone(net.linear1.weight._grad_ivar())
187

188 189 190 191 192
    def test_auto_prune2(self):
        """Run ``func_auto_prune2`` inside ``_test_eager_guard``, then outside."""
        scenario = self.func_auto_prune2
        with _test_eager_guard():
            scenario()
        scenario()

193
    # TODO(jiabin): Support this when we support better split tensor
194
    def func_auto_prune3(self):
        """Backward from the loss of AutoPruneLayer3 (``test_num=1``).

        Asserts the Linear weight has a gradient and that every element of
        the gradient of the unused ``part2`` output equals 0.
        """
        with fluid.dygraph.guard():
            net = AutoPruneLayer3(input_size=784)
            features = fluid.dygraph.to_variable(
                np.arange(784).reshape(1, 784).astype("float32")
            )
            label = fluid.dygraph.to_variable(
                np.arange(1).reshape(1, 1).astype("int64")
            )
            loss, part2 = net(features, label, 1)
            loss.backward()
            self.assertIsNotNone(net.linear.weight._grad_ivar())
            unused_grad = part2.gradient()
            self.assertTrue((unused_grad == 0).all())

206
    def test_auto_prune3(self):
        """Run ``func_auto_prune3`` with gradients retained for all tensors.

        ``FLAGS_retain_grad_for_all_tensor`` is switched on for the duration
        of the scenario, which runs once inside ``_test_eager_guard`` and
        once outside it.
        """
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        try:
            with _test_eager_guard():
                self.func_auto_prune3()
            self.func_auto_prune3()
        finally:
            # Reset even when an assertion fails so the process-wide flag
            # cannot leak into tests that run afterwards.
            fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
212 213

    def func_auto_prune4(self):
        """Backward from ``part2`` of AutoPruneLayer3 (``test_num=1``).

        Asserts the Linear weight has a gradient and that every element of
        the gradient of ``part2`` itself equals 1.
        """
        with fluid.dygraph.guard():
            net = AutoPruneLayer3(input_size=784)
            features = fluid.dygraph.to_variable(
                np.arange(784).reshape(1, 784).astype("float32")
            )
            label = fluid.dygraph.to_variable(
                np.arange(1).reshape(1, 1).astype("int64")
            )
            # The loss is returned but deliberately not back-propagated.
            loss, part2 = net(features, label, 1)
            part2.backward()
            self.assertIsNotNone(net.linear.weight._grad_ivar())
            root_grad = part2.gradient()
            self.assertTrue((root_grad == 1).all())

225
    def test_auto_prune4(self):
        """Run ``func_auto_prune4`` with gradients retained for all tensors.

        ``FLAGS_retain_grad_for_all_tensor`` is switched on for the duration
        of the scenario, which runs once inside ``_test_eager_guard`` and
        once outside it.
        """
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        try:
            with _test_eager_guard():
                self.func_auto_prune4()
            self.func_auto_prune4()
        finally:
            # Reset even when an assertion fails so the process-wide flag
            # cannot leak into tests that run afterwards.
            fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
231 232

    def func_auto_prune5(self):
        """Backward from ``part1`` of AutoPruneLayer3 (``test_num=2``).

        Asserts the Linear weight has a gradient and that every element of
        the gradient of the other half, ``part2``, equals 0.
        """
        with fluid.dygraph.guard():
            net = AutoPruneLayer3(input_size=784)
            features = fluid.dygraph.to_variable(
                np.arange(784).reshape(1, 784).astype("float32")
            )
            label = fluid.dygraph.to_variable(
                np.arange(1).reshape(1, 1).astype("int64")
            )
            # The loss is returned but deliberately not back-propagated.
            loss, part1, part2 = net(features, label, 2)
            part1.backward()
            self.assertIsNotNone(net.linear.weight._grad_ivar())
            sibling_grad = part2.gradient()
            self.assertTrue((sibling_grad == 0).all())

244
    def test_auto_prune5(self):
        """Run ``func_auto_prune5`` with gradients retained for all tensors.

        ``FLAGS_retain_grad_for_all_tensor`` is switched on for the duration
        of the scenario, which runs once inside ``_test_eager_guard`` and
        once outside it.
        """
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        try:
            with _test_eager_guard():
                self.func_auto_prune5()
            self.func_auto_prune5()
        finally:
            # Reset even when an assertion fails so the process-wide flag
            # cannot leak into tests that run afterwards.
            fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
250

251
    def func_auto_prune6(self):
        """Concat a stop-gradient Linear output with a live one and a raw input.

        After backward from the concatenation, neither the weight of the
        frozen Linear nor its output may carry a gradient.
        """
        to_var = fluid.dygraph.to_variable
        with fluid.dygraph.guard():
            frozen_fc = fluid.Linear(13, 5, dtype="float32")
            live_fc = fluid.Linear(3, 3, dtype="float32")

            frozen_in = to_var(np.arange(26).reshape(2, 13).astype("float32"))
            live_in = to_var(np.arange(6).reshape(2, 3).astype("float32"))
            raw = to_var(np.arange(10).reshape(2, 5).astype("float32"))

            frozen_out = frozen_fc(frozen_in)
            live_out = live_fc(live_in)
            frozen_out.stop_gradient = True

            merged = fluid.layers.concat(
                input=[frozen_out, live_out, raw], axis=1
            )
            merged.backward()

            self.assertIsNone(frozen_fc.weight.gradient())
            self.assertIsNone(frozen_out.gradient())
268

269 270 271 272 273 274
    def test_auto_prune6(self):
        """Run ``func_auto_prune6`` inside ``_test_eager_guard``, then outside."""
        scenario = self.func_auto_prune6
        with _test_eager_guard():
            scenario()
        scenario()

    def func_auto_prune7(self):
        """Concat a stop-gradient Linear output with a live one and a raw input.

        Performs the same steps as ``func_auto_prune6``: after backward from
        the concatenation, the frozen Linear's weight and output must both
        have no gradient.
        """
        with fluid.dygraph.guard():
            arrays = [
                np.arange(rows * cols).reshape(rows, cols).astype("float32")
                for rows, cols in ((2, 13), (2, 3), (2, 5))
            ]
            linear = fluid.Linear(13, 5, dtype="float32")
            linear2 = fluid.Linear(3, 3, dtype="float32")
            a, b, c = (fluid.dygraph.to_variable(arr) for arr in arrays)

            out1 = linear(a)
            out2 = linear2(b)
            # Freeze the first branch before it enters the concat.
            out1.stop_gradient = True
            joined = fluid.layers.concat(input=[out1, out2, c], axis=1)
            joined.backward()

            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())
291

292 293 294 295 296 297
    def test_auto_prune7(self):
        """Run ``func_auto_prune7`` inside ``_test_eager_guard``, then outside."""
        scenario = self.func_auto_prune7
        with _test_eager_guard():
            scenario()
        scenario()

    def func_auto_prune8(self):
        """Check that SGD skips a weight marked ``stop_gradient``.

        Two Linear layers are chained. ``linear2.weight`` is frozen after
        the forward pass; after backward and one SGD step it must be
        unchanged, while ``linear.weight`` must have changed.
        """
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            linear = fluid.Linear(13, 5, dtype="float32")
            linear2 = fluid.Linear(5, 3, dtype="float32")
            a = fluid.dygraph.to_variable(value0)

            out1 = linear(a)
            # Snapshot both weights before the optimizer step.
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            linear2.weight.stop_gradient = True
            out2.backward()

            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)

            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            self.assertFalse(
                np.array_equal(linear_origin, linear.weight.numpy())
            )
324

325 326 327 328 329 330
    def test_auto_prune8(self):
        """Run ``func_auto_prune8`` inside ``_test_eager_guard``, then outside."""
        scenario = self.func_auto_prune8
        with _test_eager_guard():
            scenario()
        scenario()

    def func_auto_prune9(self):
        """Check that nothing is updated when the backward root is frozen.

        Two Linear layers are chained and the final output is marked
        ``stop_gradient`` before ``backward``. After one SGD step both
        weights must be identical to their pre-step snapshots.
        """
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            linear = fluid.Linear(13, 5, dtype="float32")
            linear2 = fluid.Linear(5, 3, dtype="float32")
            a = fluid.dygraph.to_variable(value0)

            out1 = linear(a)
            # Snapshot both weights before the optimizer step.
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            out2.stop_gradient = True
            out2.backward()

            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)

            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            np.testing.assert_array_equal(linear_origin, linear.weight.numpy())

            # Querying the gradient may raise ValueError; that is tolerated.
            # NOTE(review): this check also passes when no exception is
            # raised, so it only guards against other exception types.
            try:
                linear2.weight.gradient()
            except ValueError:
                pass

360 361 362 363 364 365
    def test_auto_prune9(self):
        """Run ``func_auto_prune9`` inside ``_test_eager_guard``, then outside."""
        scenario = self.func_auto_prune9
        with _test_eager_guard():
            scenario()
        scenario()

    def func_auto_prune10(self):
        """Repeat the concat pruning check with ``FLAGS_sort_sum_gradient`` on.

        A stop-gradient Linear output is concatenated with a live Linear
        output and a raw input. After backward, the frozen Linear's weight
        and output must both have no gradient.
        """
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = fluid.Linear(13, 5, dtype="float32")
            linear2 = fluid.Linear(3, 3, dtype="float32")
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            # TODO(jiabin): In Eager Mode we don't actually need
            # sort_sum_gradient, this test should be removed when we don't
            # support fluid anymore.
            # Remember the current value so the process-wide flag is put
            # back afterwards instead of leaking into later tests.
            saved_flags = fluid.get_flags(['FLAGS_sort_sum_gradient'])
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
            try:
                out.backward()
            finally:
                fluid.set_flags(saved_flags)
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())
384

385 386 387 388 389 390
    def test_auto_prune10(self):
        """Run ``func_auto_prune10`` inside ``_test_eager_guard``, then outside."""
        scenario = self.func_auto_prune10
        with _test_eager_guard():
            scenario()
        scenario()

    def func_auto_prune_with_optimizer(self):
        """Check that the optimizer ignores parameters absent from backward.

        Runs the same check on ``MyLayer`` (whose ``forward`` uses only the
        Linear layers) and on ``MyLayer2`` (whose ``forward`` uses every
        parameter). In both cases only ``embed_linear0`` is back-propagated,
        so ``embed1`` and ``linear_1`` must be pruned.
        """
        vocab_size = 100
        size = 20
        batch_size = 16

        indices_np = np.random.randint(
            low=0, high=100, size=(batch_size, 1)
        ).astype("int64")
        embed_np = np.random.randn(batch_size, size).astype("float32")

        place = fluid.CPUPlace()
        with fluid.dygraph.guard(place):
            model = MyLayer(size, vocab_size, size)
            indices = fluid.dygraph.to_variable(indices_np)
            embed = fluid.dygraph.to_variable(embed_np)
            self._check_unused_params_pruned(model, embed, indices)

        with fluid.dygraph.guard(place):
            model = MyLayer2(size, vocab_size, size)
            indices = fluid.dygraph.to_variable(indices_np)
            self._check_unused_params_pruned(model, indices, indices)

    def _check_unused_params_pruned(self, model, forward_input, indices):
        """Assert ``embed1``/``linear_1`` are skipped by backward and minimize.

        Args:
            model: layer exposing ``embed1``, ``linear_1`` and
                ``embed_linear0`` (``MyLayer`` or ``MyLayer2``).
            forward_input: tensor passed to ``model(...)`` for a forward
                pass that is never back-propagated.
            indices: tensor passed to ``model.embed_linear0``.
        """
        grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
        optimizer = fluid.optimizer.AdamOptimizer(
            0.001, parameter_list=model.parameters(), grad_clip=grad_clip
        )
        # Forward only: this result is kept alive but never back-propagated.
        dummy_loss = model(forward_input)

        loss = model.embed_linear0(indices)
        loss.backward()
        # Take the pairs returned for *this* model so the name check below
        # does not run against results left over from another model.
        _, params_grads = optimizer.minimize(loss)

        unused_names = {
            model.embed1.weight.name,
            model.linear_1.weight.name,
        }
        for param_and_grad in params_grads:
            # Compare names by value, not by object identity.
            self.assertNotIn(param_and_grad[0].name, unused_names)
        self.assertIsNone(model.embed1.weight._grad_ivar())
        self.assertIsNone(model.linear_1.weight._grad_ivar())
439

440 441 442 443 444 445
    def test_auto_prune_with_optimizer(self):
        """Run the optimizer scenario inside ``_test_eager_guard``, then outside."""
        scenario = self.func_auto_prune_with_optimizer
        with _test_eager_guard():
            scenario()
        scenario()

    def func_case2_prune_no_grad_branch(self):
        """Check pruning of the label branch in AutoPruneLayer2.

        After backward, ``linear2`` (applied to the label before the casts)
        must have no weight gradient while ``linear`` must have one.
        """
        def as_float_tensor(array):
            # Convert to a tensor first, then cast the tensor to float32.
            return fluid.dygraph.to_variable(array).astype("float32")

        with fluid.dygraph.guard():
            features = as_float_tensor(np.arange(784).reshape(1, 784))
            label = as_float_tensor(np.arange(1).reshape(1, 1))
            net = AutoPruneLayer2(input_size=784)
            loss = net(features, label)
            loss.backward()
            self.assertIsNone(net.linear2.weight._grad_ivar())
            self.assertIsNotNone(net.linear.weight._grad_ivar())
456

457 458 459 460 461 462
    def test_case2_prune_no_grad_branch(self):
        """Run the case-2 scenario inside ``_test_eager_guard``, then outside."""
        scenario = self.func_case2_prune_no_grad_branch
        with _test_eager_guard():
            scenario()
        scenario()

    def func_case3_prune_no_grad_branch2(self):
        """Check pruning when a Linear output only feeds ``one_hot``.

        The Linear output is cast to float32, then int64, one-hot encoded
        with depth 100 and averaged. After backward the Linear weight must
        have no gradient.
        """
        with fluid.dygraph.guard():
            raw_label = np.arange(1).reshape(1, 1)
            linear = fluid.dygraph.Linear(1, 1, act=None)
            label = linear(
                fluid.dygraph.to_variable(raw_label).astype("float32")
            )
            # Cast to float32 first, then to int64, in that order.
            for dtype in ("float32", "int64"):
                label = fluid.layers.cast(label, dtype=dtype)
            encoded = fluid.layers.one_hot(input=label, depth=100)
            loss = paddle.mean(encoded)
            loss.backward()
            self.assertIsNone(linear.weight._grad_ivar())
474

475 476 477 478 479 480
    def test_case3_prune_no_grad_branch2(self):
        """Run the case-3 scenario inside ``_test_eager_guard``, then outside."""
        scenario = self.func_case3_prune_no_grad_branch2
        with _test_eager_guard():
            scenario()
        scenario()

    def func_case4_with_no_grad_op_maker(self):
        """Check that a ``gaussian_random`` output gets no gradient.

        Averages a 20x30 ``gaussian_random`` tensor, back-propagates the
        mean and asserts the random tensor's gradient is ``None``.
        """
        with fluid.dygraph.guard():
            noise = fluid.layers.gaussian_random(shape=[20, 30])
            mean_value = paddle.mean(noise)
            mean_value.backward()
            self.assertIsNone(noise._grad_ivar())
486

487 488 489 490 491
    def test_case4_with_no_grad_op_maker(self):
        """Run the case-4 scenario inside ``_test_eager_guard``, then outside."""
        scenario = self.func_case4_with_no_grad_op_maker
        with _test_eager_guard():
            scenario()
        scenario()

492 493 494

# Run the unittest runner when this module is executed as a script.
if __name__ == '__main__':
    unittest.main()