# test_imperative_auto_prune.py
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.fluid.framework import _test_eager_guard
from paddle.tensor import random


class AutoPruneLayer0(fluid.Layer):
    """Two bias-free linear branches combined with ``mul`` and averaged.

    Both weights are initialised to the constant 2, and neither branch has
    its gradient stopped.
    """

    def __init__(self, input_size):
        super().__init__()

        def constant_weight():
            # Build a fresh ParamAttr for every layer that asks for one.
            return paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            )

        self.linear1 = paddle.nn.Linear(
            input_size, 5, weight_attr=constant_weight(), bias_attr=False
        )
        self.linear2 = paddle.nn.Linear(
            5, 5, weight_attr=constant_weight(), bias_attr=False
        )

    def forward(self, x, y):
        """Return ``mean(mul(linear1(x), linear2(y)))``."""
        left = self.linear1(x)
        right = self.linear2(y)
        return paddle.mean(fluid.layers.mul(left, right))


class AutoPruneLayer1(fluid.Layer):
    """Like a two-branch mul/mean layer, but the second branch is detached.

    ``forward`` sets ``stop_gradient = True`` on the output of ``linear2``
    before it is multiplied with the output of ``linear1``.
    """

    def __init__(self, input_size):
        super().__init__()

        def bias_free_linear(in_features):
            # 5 output features, weight filled with the constant 2, no bias.
            return paddle.nn.Linear(
                in_features,
                5,
                weight_attr=paddle.ParamAttr(
                    initializer=paddle.nn.initializer.Constant(value=2)
                ),
                bias_attr=False,
            )

        self.linear1 = bias_free_linear(input_size)
        self.linear2 = bias_free_linear(5)

    def forward(self, x, y):
        """Return ``mean(mul(linear1(x), linear2(y)))`` with branch 2 detached."""
        kept = self.linear1(x)
        detached = self.linear2(y)
        # Mark the second branch as not requiring gradients before it is used.
        detached.stop_gradient = True
        product = fluid.layers.mul(kept, detached)
        return paddle.mean(product)


class AutoPruneLayer2(fluid.Layer):
    """Cross-entropy model whose label passes through a layer and an int cast.

    ``linear`` maps the input to 10 features; ``linear2`` (1 -> 1) processes
    the label, which is then cast to float32 and on to int64 before it is
    handed to ``cross_entropy``.
    """

    def __init__(self, input_size):
        super().__init__()
        self.linear = paddle.nn.Linear(input_size, 10)
        self.linear2 = paddle.nn.Linear(1, 1)

    def forward(self, x, label):
        """Return the mean cross-entropy of ``linear(x)`` against the label."""
        logits = self.linear(x)
        projected = self.linear2(label)
        as_float = fluid.layers.cast(projected, dtype="float32")
        as_int = fluid.layers.cast(as_float, dtype='int64')
        # Note that the label is not persistable in fluid.layers.cross_entropy.
        per_sample = fluid.layers.cross_entropy(input=logits, label=as_int)
        return paddle.mean(per_sample)


class AutoPruneLayer3(fluid.Layer):
    """Linear layer whose 20 outputs are split into two halves of 10.

    Only the first half feeds the cross-entropy loss; the second half is
    handed back to the caller without being used in the loss.
    """

    def __init__(self, input_size):
        super().__init__()
        self.linear = paddle.nn.Linear(input_size, 20)

    def forward(self, x, label, test_num):
        """Return ``(loss, part2)`` if ``test_num == 1``, else all three."""
        halves = fluid.layers.split(
            self.linear(x), num_or_sections=[10, 10], dim=1
        )
        part1, part2 = halves
        # Note that: part2 is not used.
        loss = paddle.mean(
            fluid.layers.cross_entropy(input=part1, label=label)
        )
        if test_num == 1:
            return loss, part2
        return loss, part1, part2


class MyLayer(fluid.Layer):
    """Holds two embeddings and two linear layers; ``forward`` uses only the
    linear layers, while the extra entry points use subsets of the params."""

    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        table_shape = (vocab_size, size)
        self.embed0 = fluid.Embedding(size=table_shape)
        self.embed1 = fluid.Embedding(size=table_shape)
        self.linear_0 = paddle.nn.Linear(input_size, size)
        self.linear_1 = paddle.nn.Linear(input_size, size)

    def forward(self, x):
        """Return ``mean(linear_0(x) + linear_1(x))``."""
        # this method involves only the linear layers
        summed = self.linear_0(x) + self.linear_1(x)
        return paddle.mean(summed)

    def linear0(self, x):
        """Return ``mean(linear_0(x))``."""
        return paddle.mean(self.linear_0(x))

    def embed_linear0(self, x):
        """Return ``mean(linear_0(embed0(x)))``."""
        embedded = self.embed0(x)
        return paddle.mean(self.linear_0(embedded))


class MyLayer2(fluid.Layer):
    """Holds two embeddings and two linear layers; ``forward`` uses them all,
    while the extra entry points use subsets of the params."""

    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        table_shape = (vocab_size, size)
        self.embed0 = fluid.Embedding(size=table_shape)
        self.embed1 = fluid.Embedding(size=table_shape)
        self.linear_0 = paddle.nn.Linear(input_size, size)
        self.linear_1 = paddle.nn.Linear(input_size, size)

    def forward(self, indices):
        """Return ``mean(linear_0(embed0(i)) + linear_1(embed1(i)))``."""
        # mind the difference with MyLayer
        # In this example, the forward method involves all params
        first = self.linear_0(self.embed0(indices))
        second = self.linear_1(self.embed1(indices))
        return paddle.mean(first + second)

    def linear0(self, x):
        """Return ``mean(linear_0(x))``."""
        return paddle.mean(self.linear_0(x))

    def embed_linear0(self, x):
        """Return ``mean(linear_0(embed0(x)))``."""
        embedded = self.embed0(x)
        return paddle.mean(self.linear_0(embedded))


class TestImperativeAutoPrune(unittest.TestCase):
    """Checks which tensors end up with gradients after ``backward()``.

    Every ``func_*`` method holds one scenario. The matching ``test_*``
    method runs that scenario twice: once inside ``_test_eager_guard()`` and
    once outside of it.
    """

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    def _run_in_both_modes(self, scenario):
        """Call ``scenario`` inside ``_test_eager_guard()`` and then again
        outside of it."""
        with _test_eager_guard():
            scenario()
        scenario()

    def _run_with_retained_grads(self, scenario):
        """Run ``scenario`` in both modes with
        ``FLAGS_retain_grad_for_all_tensor`` switched on.

        The flag is switched off again in a ``finally`` clause so that a
        failing scenario cannot leak the setting into later tests.
        """
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        try:
            self._run_in_both_modes(scenario)
        finally:
            fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

    @staticmethod
    def _layer3_inputs():
        """Return the ``(x, label)`` tensors fed to ``AutoPruneLayer3``."""
        value1 = np.arange(784).reshape(1, 784).astype("float32")
        value2 = np.arange(1).reshape(1, 1).astype("int64")
        v1 = fluid.dygraph.to_variable(value1)
        v2 = fluid.dygraph.to_variable(value2)
        return v1, v2

    @staticmethod
    def _concat_with_stopped_branch():
        """Build ``concat([out1, out2, c])`` where ``out1`` has its gradient
        stopped.

        Must be called inside ``fluid.dygraph.guard()``.

        Returns:
            tuple: ``(linear, out1, out)`` - the layer that produced the
            stopped branch, the stopped branch itself and the concat result.
        """
        value0 = np.arange(26).reshape(2, 13).astype("float32")
        value1 = np.arange(6).reshape(2, 3).astype("float32")
        value2 = np.arange(10).reshape(2, 5).astype("float32")
        linear = paddle.nn.Linear(13, 5)
        linear2 = paddle.nn.Linear(3, 3)
        a = fluid.dygraph.to_variable(value0)
        b = fluid.dygraph.to_variable(value1)
        c = fluid.dygraph.to_variable(value2)
        out1 = linear(a)
        out2 = linear2(b)
        out1.stop_gradient = True
        out = fluid.layers.concat(input=[out1, out2, c], axis=1)
        return linear, out1, out

    def _check_optimizer_skips_unused_params(self, model, dummy_input, indices):
        """Assert that ``embed1``/``linear_1`` are neither given gradients nor
        returned by ``optimizer.minimize`` when only ``embed_linear0`` is
        back-propagated.

        Args:
            model: a ``MyLayer`` or ``MyLayer2`` instance.
            dummy_input: tensor passed to ``model.forward``; that loss is
                never back-propagated.
            indices: tensor passed to ``model.embed_linear0``.
        """
        grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
        optimizer = fluid.optimizer.AdamOptimizer(
            0.001, parameter_list=model.parameters(), grad_clip=grad_clip
        )
        # Run the full forward pass as well; its result is deliberately not
        # used for backward, so the params only it touches get no gradient.
        dummy_loss = model(dummy_input)

        loss = model.embed_linear0(indices)
        loss.backward()
        # Always inspect the pairs returned by *this* minimize call.
        _, params_grads = optimizer.minimize(loss)
        for items in params_grads:
            # Compare the names by value; an identity check on strings would
            # pass even when the names are equal.
            self.assertNotEqual(items[0].name, model.embed1.weight.name)
            self.assertNotEqual(items[0].name, model.linear_1.weight.name)
        self.assertIsNone(model.embed1.weight._grad_ivar())
        self.assertIsNone(model.linear_1.weight._grad_ivar())

    # ------------------------------------------------------------------
    # Scenarios
    # ------------------------------------------------------------------
    def func_auto_prune(self):
        """Both branches of ``AutoPruneLayer0`` receive a gradient."""
        with fluid.dygraph.guard():
            case1 = AutoPruneLayer0(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case1(v1, v2)
            loss.backward()
            self.assertIsNotNone(case1.linear2.weight._grad_ivar())
            self.assertIsNotNone(case1.linear1.weight._grad_ivar())

    def test_auto_prune(self):
        self._run_in_both_modes(self.func_auto_prune)

    def func_auto_prune2(self):
        """The stopped branch of ``AutoPruneLayer1`` receives no gradient."""
        with fluid.dygraph.guard():
            case2 = AutoPruneLayer1(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case2(v1, v2)

            loss.backward()
            self.assertIsNone(case2.linear2.weight._grad_ivar())
            self.assertIsNotNone(case2.linear1.weight._grad_ivar())

    def test_auto_prune2(self):
        self._run_in_both_modes(self.func_auto_prune2)

    # TODO(jiabin): Support this when we support better split tensor
    def func_auto_prune3(self):
        """Backward from the loss: ``part2`` ends up with an all-zero grad."""
        with fluid.dygraph.guard():
            case3 = AutoPruneLayer3(input_size=784)
            v1, v2 = self._layer3_inputs()
            loss, part2 = case3(v1, v2, 1)
            loss.backward()
            self.assertIsNotNone(case3.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())

    def test_auto_prune3(self):
        self._run_with_retained_grads(self.func_auto_prune3)

    def func_auto_prune4(self):
        """Backward from ``part2``: its own gradient is all ones."""
        with fluid.dygraph.guard():
            case4 = AutoPruneLayer3(input_size=784)
            v1, v2 = self._layer3_inputs()
            loss, part2 = case4(v1, v2, 1)
            part2.backward()
            self.assertIsNotNone(case4.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 1).all())

    def test_auto_prune4(self):
        self._run_with_retained_grads(self.func_auto_prune4)

    def func_auto_prune5(self):
        """Backward from ``part1``: ``part2`` ends up with an all-zero grad."""
        with fluid.dygraph.guard():
            case4 = AutoPruneLayer3(input_size=784)
            v1, v2 = self._layer3_inputs()
            loss, part1, part2 = case4(v1, v2, 2)
            part1.backward()
            self.assertIsNotNone(case4.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())

    def test_auto_prune5(self):
        self._run_with_retained_grads(self.func_auto_prune5)

    def func_auto_prune6(self):
        """A stopped concat input and the layer behind it get no gradient."""
        with fluid.dygraph.guard():
            linear, out1, out = self._concat_with_stopped_branch()
            out.backward()
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def test_auto_prune6(self):
        self._run_in_both_modes(self.func_auto_prune6)

    def func_auto_prune7(self):
        """Same scenario and assertions as ``func_auto_prune6``."""
        with fluid.dygraph.guard():
            linear, out1, out = self._concat_with_stopped_branch()
            out.backward()
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def test_auto_prune7(self):
        self._run_in_both_modes(self.func_auto_prune7)

    def func_auto_prune8(self):
        """A weight with ``stop_gradient`` set is left untouched by SGD while
        the upstream layer's weight is updated."""
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(5, 3)
            a = fluid.dygraph.to_variable(value0)
            out1 = linear(a)
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            linear2.weight.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)
            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            self.assertFalse(
                np.array_equal(linear_origin, linear.weight.numpy())
            )

    def test_auto_prune8(self):
        self._run_in_both_modes(self.func_auto_prune8)

    def func_auto_prune9(self):
        """With ``stop_gradient`` set on the final output, SGD changes
        neither layer's weight."""
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(5, 3)
            a = fluid.dygraph.to_variable(value0)
            out1 = linear(a)
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            out2.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)
            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            np.testing.assert_array_equal(linear_origin, linear.weight.numpy())
            # Reading the gradient may either succeed or raise; if it raises,
            # the exception must be exactly a ValueError.
            try:
                linear2.weight.gradient()
            except ValueError as e:
                self.assertIs(type(e), ValueError)

    def test_auto_prune9(self):
        self._run_in_both_modes(self.func_auto_prune9)

    def func_auto_prune10(self):
        """Same as ``func_auto_prune6`` but with ``FLAGS_sort_sum_gradient``
        enabled for the backward pass."""
        with fluid.dygraph.guard():
            linear, out1, out = self._concat_with_stopped_branch()
            # TODO(jiabin): In Eager Mode we don't actually need sort_sum_gradient, this test should be removed when we don't support fluid anymore.
            flag = 'FLAGS_sort_sum_gradient'
            # Remember the current value so the global flag does not stay
            # switched on for the tests that run afterwards.
            saved = fluid.get_flags([flag])
            fluid.set_flags({flag: True})
            try:
                out.backward()
            finally:
                fluid.set_flags(saved)
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def test_auto_prune10(self):
        self._run_in_both_modes(self.func_auto_prune10)

    def func_auto_prune_with_optimizer(self):
        """Params unused by the back-propagated loss are skipped by the
        optimizer, for both ``MyLayer`` and ``MyLayer2``."""
        vocab_size = 100
        size = 20
        batch_size = 16

        indices = np.random.randint(
            low=0, high=100, size=(batch_size, 1)
        ).astype("int64")
        embed = np.random.randn(batch_size, size).astype("float32")

        place = fluid.CPUPlace()
        with fluid.dygraph.guard(place):
            model = MyLayer(size, vocab_size, size)
            indices = fluid.dygraph.to_variable(indices)
            embed = fluid.dygraph.to_variable(embed)
            self._check_optimizer_skips_unused_params(model, embed, indices)

        with fluid.dygraph.guard(place):
            model = MyLayer2(size, vocab_size, size)
            indices = fluid.dygraph.to_variable(indices)
            self._check_optimizer_skips_unused_params(model, indices, indices)

    def test_auto_prune_with_optimizer(self):
        self._run_in_both_modes(self.func_auto_prune_with_optimizer)

    def func_case2_prune_no_grad_branch(self):
        """The label branch of ``AutoPruneLayer2`` receives no gradient."""
        with fluid.dygraph.guard():
            value1 = np.arange(784).reshape(1, 784)
            value2 = np.arange(1).reshape(1, 1)
            v1 = fluid.dygraph.to_variable(value1).astype("float32")
            v2 = fluid.dygraph.to_variable(value2).astype("float32")
            case3 = AutoPruneLayer2(input_size=784)
            loss = case3(v1, v2)
            loss.backward()
            self.assertIsNone(case3.linear2.weight._grad_ivar())
            self.assertIsNotNone(case3.linear.weight._grad_ivar())

    def test_case2_prune_no_grad_branch(self):
        self._run_in_both_modes(self.func_case2_prune_no_grad_branch)

    def func_case3_prune_no_grad_branch2(self):
        """A layer whose output only reaches the loss through an int64 cast
        and ``one_hot`` receives no gradient."""
        with fluid.dygraph.guard():
            value1 = np.arange(1).reshape(1, 1)
            linear = paddle.nn.Linear(1, 1)
            label = fluid.dygraph.to_variable(value1).astype("float32")
            label = linear(label)
            label = fluid.layers.cast(label, dtype="float32")
            label = fluid.layers.cast(label, dtype='int64')
            out = fluid.layers.one_hot(input=label, depth=100)
            loss = paddle.mean(out)
            loss.backward()
            self.assertIsNone(linear.weight._grad_ivar())

    def test_case3_prune_no_grad_branch2(self):
        self._run_in_both_modes(self.func_case3_prune_no_grad_branch2)

    def func_case4_with_no_grad_op_maker(self):
        """The output of ``random.gaussian`` has no gradient after backward."""
        with fluid.dygraph.guard():
            out = random.gaussian(shape=[20, 30])
            loss = paddle.mean(out)
            loss.backward()
            self.assertIsNone(out._grad_ivar())

    def test_case4_with_no_grad_op_maker(self):
        self._run_in_both_modes(self.func_case4_with_no_grad_op_maker)

# Run every test in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()