# test_imperative_auto_prune.py
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import paddle.fluid as fluid
import numpy as np
18
from paddle.fluid.framework import _test_eager_guard
19 20 21


class AutoPruneLayer0(fluid.Layer):
    """Two bias-free Linear layers whose outputs are multiplied and averaged.

    Neither branch has its gradient stopped in this layer.
    """

    def __init__(self, input_size):
        super().__init__()
        # Both weights are initialised to the constant 2 and carry no bias.
        self.linear1 = fluid.dygraph.Linear(
            input_size,
            5,
            param_attr=fluid.initializer.ConstantInitializer(value=2),
            bias_attr=False,
        )
        self.linear2 = fluid.dygraph.Linear(
            5,
            5,
            param_attr=fluid.initializer.ConstantInitializer(value=2),
            bias_attr=False,
        )

    def forward(self, x, y):
        """Return reduce_mean(mul(linear1(x), linear2(y)))."""
        product = fluid.layers.mul(self.linear1(x), self.linear2(y))
        return fluid.layers.reduce_mean(product)


class AutoPruneLayer1(fluid.Layer):
    """Same structure as two multiplied Linear branches, second branch stopped.

    The output of ``linear2`` is marked ``stop_gradient = True`` before it is
    multiplied with the output of ``linear1``.
    """

    def __init__(self, input_size):
        super().__init__()
        # Both weights are initialised to the constant 2 and carry no bias.
        self.linear1 = fluid.dygraph.Linear(
            input_size,
            5,
            param_attr=fluid.initializer.ConstantInitializer(value=2),
            bias_attr=False,
        )
        self.linear2 = fluid.dygraph.Linear(
            5,
            5,
            param_attr=fluid.initializer.ConstantInitializer(value=2),
            bias_attr=False,
        )

    def forward(self, x, y):
        """Return reduce_mean(mul(linear1(x), stopped linear2(y)))."""
        live_branch = self.linear1(x)
        stopped_branch = self.linear2(y)
        # Cut the second branch out of the backward graph.
        stopped_branch.stop_gradient = True
        product = fluid.layers.mul(live_branch, stopped_branch)
        return fluid.layers.reduce_mean(product)


class AutoPruneLayer2(fluid.Layer):
    """Cross-entropy model whose label is produced by a Linear layer + casts."""

    def __init__(self, input_size):
        super().__init__()
        self.linear = fluid.dygraph.Linear(input_size, 10, act=None)
        self.linear2 = fluid.dygraph.Linear(1, 1, act=None)

    def forward(self, x, label):
        """Return the mean cross-entropy of linear(x) against the cast label."""
        feature = self.linear(x)
        # The label goes through linear2, then float32, then int64.
        as_float = fluid.layers.cast(self.linear2(label), dtype="float32")
        as_int = fluid.layers.cast(as_float, dtype='int64')
        # Note that the label is not persistable in fluid.layers.cross_entropy.
        per_sample = fluid.layers.cross_entropy(input=feature, label=as_int)
        return fluid.layers.mean(per_sample)


class AutoPruneLayer3(fluid.Layer):
    """Linear layer whose 20-wide output is split into two 10-wide parts."""

    def __init__(self, input_size):
        super().__init__()
        self.linear = fluid.dygraph.Linear(input_size, 20, act=None)

    def forward(self, x, label, test_num):
        """Compute the loss from the first split part only.

        Returns ``(loss, part2)`` when ``test_num == 1`` and
        ``(loss, part1, part2)`` otherwise.
        """
        halves = fluid.layers.split(
            self.linear(x), num_or_sections=[10, 10], dim=1
        )
        part1, part2 = halves
        # Note that: part2 is not used.
        loss = fluid.layers.mean(
            fluid.layers.cross_entropy(input=part1, label=label)
        )
        if test_num == 1:
            return loss, part2
        return loss, part1, part2


class MyLayer(fluid.Layer):
    """Holds two embeddings and two Linear layers; forward uses only the latter."""

    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        self.embed0 = fluid.Embedding(size=(vocab_size, size))
        self.embed1 = fluid.Embedding(size=(vocab_size, size))
        self.linear_0 = fluid.Linear(input_size, size, dtype=dtype)
        self.linear_1 = fluid.Linear(input_size, size, dtype=dtype)

    def forward(self, x):
        """Return reduce_mean(linear_0(x) + linear_1(x))."""
        # this method involves only the linear layers
        first = self.linear_0(x)
        second = self.linear_1(x)
        return fluid.layers.reduce_mean(first + second)

    def linear0(self, x):
        """Return reduce_mean(linear_0(x))."""
        return fluid.layers.reduce_mean(self.linear_0(x))

    def embed_linear0(self, x):
        """Return reduce_mean(linear_0(embed0(x)))."""
        embedded = self.embed0(x)
        return fluid.layers.reduce_mean(self.linear_0(embedded))


class MyLayer2(fluid.Layer):
    """Holds two embeddings and two Linear layers; forward uses all of them."""

    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        self.embed0 = fluid.Embedding(size=(vocab_size, size))
        self.embed1 = fluid.Embedding(size=(vocab_size, size))
        self.linear_0 = fluid.Linear(input_size, size, dtype=dtype)
        self.linear_1 = fluid.Linear(input_size, size, dtype=dtype)

    def forward(self, indices):
        """Return reduce_mean(linear_0(embed0(i)) + linear_1(embed1(i)))."""
        # mind the difference with MyLayer
        # In this example, the forward method involves all params
        first = self.linear_0(self.embed0(indices))
        second = self.linear_1(self.embed1(indices))
        return fluid.layers.reduce_mean(first + second)

    def linear0(self, x):
        """Return reduce_mean(linear_0(x))."""
        return fluid.layers.reduce_mean(self.linear_0(x))

    def embed_linear0(self, x):
        """Return reduce_mean(linear_0(embed0(x)))."""
        embedded = self.embed0(x)
        return fluid.layers.reduce_mean(self.linear_0(embedded))


class TestImperativeAutoPrune(unittest.TestCase):
    """Checks which gradients a dygraph ``backward()`` produces or leaves unset.

    Each ``func_*`` method is one scenario. Its ``test_*`` wrapper runs the
    scenario twice: first inside ``_test_eager_guard()`` and then outside it.
    """

    def _run_retaining_all_grads(self, scenario):
        """Run ``scenario`` in both modes with FLAGS_retain_grad_for_all_tensor on.

        The flag is switched back to False even when the scenario fails, so a
        failing test cannot leak the flag into later tests.
        """
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        try:
            with _test_eager_guard():
                scenario()
            scenario()
        finally:
            fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

    def func_auto_prune(self):
        """AutoPruneLayer0: both Linear weights have a gradient after backward."""
        with fluid.dygraph.guard():
            case1 = AutoPruneLayer0(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case1(v1, v2)
            loss.backward()
            self.assertIsNotNone(case1.linear2.weight._grad_ivar())
            self.assertIsNotNone(case1.linear1.weight._grad_ivar())

    def test_auto_prune(self):
        with _test_eager_guard():
            self.func_auto_prune()
        self.func_auto_prune()

    def func_auto_prune2(self):
        """AutoPruneLayer1: the stopped branch's weight has no gradient."""
        with fluid.dygraph.guard():
            case2 = AutoPruneLayer1(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case2(v1, v2)

            loss.backward()
            self.assertIsNone(case2.linear2.weight._grad_ivar())
            self.assertIsNotNone(case2.linear1.weight._grad_ivar())

    def test_auto_prune2(self):
        with _test_eager_guard():
            self.func_auto_prune2()
        self.func_auto_prune2()

    # TODO(jiabin): Support this when we support better split tensor
    def func_auto_prune3(self):
        """Backward from the loss: part2's gradient is expected to be all zero."""
        with fluid.dygraph.guard():
            case3 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part2 = case3(v1, v2, 1)
            loss.backward()
            self.assertIsNotNone(case3.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())

    def test_auto_prune3(self):
        self._run_retaining_all_grads(self.func_auto_prune3)

    def func_auto_prune4(self):
        """Backward from part2: part2's own gradient is expected to be all one."""
        with fluid.dygraph.guard():
            case4 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part2 = case4(v1, v2, 1)
            part2.backward()
            self.assertIsNotNone(case4.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 1).all())

    def test_auto_prune4(self):
        self._run_retaining_all_grads(self.func_auto_prune4)

    def func_auto_prune5(self):
        """Backward from part1: part2's gradient is expected to be all zero."""
        with fluid.dygraph.guard():
            case5 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part1, part2 = case5(v1, v2, 2)
            part1.backward()
            self.assertIsNotNone(case5.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())

    def test_auto_prune5(self):
        self._run_retaining_all_grads(self.func_auto_prune5)

    def func_auto_prune6(self):
        """Concat with a stopped input: that input and its weight get no grad."""
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = fluid.Linear(13, 5, dtype="float32")
            linear2 = fluid.Linear(3, 3, dtype="float32")
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            out.backward()
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def test_auto_prune6(self):
        with _test_eager_guard():
            self.func_auto_prune6()
        self.func_auto_prune6()

    def func_auto_prune7(self):
        """Same scenario and assertions as ``func_auto_prune6``."""
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = fluid.Linear(13, 5, dtype="float32")
            linear2 = fluid.Linear(3, 3, dtype="float32")
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            out.backward()
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def test_auto_prune7(self):
        with _test_eager_guard():
            self.func_auto_prune7()
        self.func_auto_prune7()

    def func_auto_prune8(self):
        """A weight with stop_gradient=True is left untouched by SGD."""
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            linear = fluid.Linear(13, 5, dtype="float32")
            linear2 = fluid.Linear(5, 3, dtype="float32")
            a = fluid.dygraph.to_variable(value0)
            out1 = linear(a)
            # Snapshot both weights before the optimizer step.
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            linear2.weight.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()))
            optimizer.minimize(out2)
            # linear2's weight is expected unchanged, linear's weight changed.
            self.assertTrue(
                np.array_equal(linear2_origin, linear2.weight.numpy()))
            self.assertFalse(
                np.array_equal(linear_origin, linear.weight.numpy()))

    def test_auto_prune8(self):
        with _test_eager_guard():
            self.func_auto_prune8()
        self.func_auto_prune8()

    def func_auto_prune9(self):
        """Backward from a stopped output leaves both weights unchanged."""
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            linear = fluid.Linear(13, 5, dtype="float32")
            linear2 = fluid.Linear(5, 3, dtype="float32")
            a = fluid.dygraph.to_variable(value0)
            out1 = linear(a)
            # Snapshot both weights before the optimizer step.
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            out2.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()))
            optimizer.minimize(out2)
            self.assertTrue(
                np.array_equal(linear2_origin, linear2.weight.numpy()))
            self.assertTrue(np.array_equal(linear_origin,
                                           linear.weight.numpy()))
            # Best effort: gradient() may either return or raise ValueError;
            # only the exact exception type is checked when it does raise.
            try:
                linear2.weight.gradient()
            except ValueError as e:
                self.assertIs(type(e), ValueError)

    def test_auto_prune9(self):
        with _test_eager_guard():
            self.func_auto_prune9()
        self.func_auto_prune9()

    def func_auto_prune10(self):
        """Scenario of ``func_auto_prune6`` with FLAGS_sort_sum_gradient on."""
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = fluid.Linear(13, 5, dtype="float32")
            linear2 = fluid.Linear(3, 3, dtype="float32")
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            #TODO(jiabin): In Eager Mode we don't actually need sort_sum_gradient, this test should be removed when we don't support fluid anymore.
            # Remember the current flag value so it can be put back afterwards
            # instead of leaking into whichever test runs next.
            saved_flags = fluid.get_flags(['FLAGS_sort_sum_gradient'])
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
            try:
                out.backward()
                self.assertIsNone(linear.weight.gradient())
                self.assertIsNone(out1.gradient())
            finally:
                fluid.set_flags(saved_flags)

    def test_auto_prune10(self):
        with _test_eager_guard():
            self.func_auto_prune10()
        self.func_auto_prune10()

    def func_auto_prune_with_optimizer(self):
        """Parameters unused by the loss are absent from the optimizer's grads.

        Runs the same check on ``MyLayer`` and ``MyLayer2``: after backward on
        ``embed_linear0``, ``embed1`` and ``linear_1`` must have no gradient
        and must not appear in the ``params_grads`` returned by ``minimize``.
        """
        vocab_size = 100
        size = 20
        batch_size = 16

        # Keep the numpy inputs under their own names so that each guard
        # below converts fresh tensors from them.
        indices_np = np.random.randint(low=0, high=vocab_size,
                                       size=(batch_size, 1)).astype("int64")
        embed_np = np.random.randn(batch_size, size).astype("float32")

        place = fluid.CPUPlace()
        with fluid.dygraph.guard(place):
            model = MyLayer(size, vocab_size, size)
            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip)
            indices = fluid.dygraph.to_variable(indices_np)
            embed = fluid.dygraph.to_variable(embed_np)
            # Forward pass whose result is deliberately never back-propagated.
            dummy_loss = model(embed)

            loss = model.embed_linear0(indices)
            loss.backward()
            _, params_grads = optimizer.minimize(loss)
            for items in params_grads:
                # Compare names by value; identity of str objects is not
                # meaningful.
                self.assertNotEqual(items[0].name, model.embed1.weight.name)
                self.assertNotEqual(items[0].name, model.linear_1.weight.name)
            self.assertIsNone(model.embed1.weight._grad_ivar())
            self.assertIsNone(model.linear_1.weight._grad_ivar())

        with fluid.dygraph.guard(place):
            model = MyLayer2(size, vocab_size, size)
            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip)

            indices = fluid.dygraph.to_variable(indices_np)
            # Forward pass whose result is deliberately never back-propagated.
            dummy_loss = model(indices)

            loss = model.embed_linear0(indices)
            loss.backward()
            # Capture this model's params_grads; otherwise the loop below
            # would re-check the list produced for the first model.
            _, params_grads = optimizer.minimize(loss)
            for items in params_grads:
                self.assertNotEqual(items[0].name, model.embed1.weight.name)
                self.assertNotEqual(items[0].name, model.linear_1.weight.name)
            self.assertIsNone(model.embed1.weight._grad_ivar())
            self.assertIsNone(model.linear_1.weight._grad_ivar())

    def test_auto_prune_with_optimizer(self):
        with _test_eager_guard():
            self.func_auto_prune_with_optimizer()
        self.func_auto_prune_with_optimizer()

    def func_case2_prune_no_grad_branch(self):
        """AutoPruneLayer2: the label branch's weight receives no gradient."""
        with fluid.dygraph.guard():
            value1 = np.arange(784).reshape(1, 784)
            value2 = np.arange(1).reshape(1, 1)
            v1 = fluid.dygraph.to_variable(value1).astype("float32")
            v2 = fluid.dygraph.to_variable(value2).astype("float32")
            case3 = AutoPruneLayer2(input_size=784)
            loss = case3(v1, v2)
            loss.backward()
            self.assertIsNone(case3.linear2.weight._grad_ivar())
            self.assertIsNotNone(case3.linear.weight._grad_ivar())

    def test_case2_prune_no_grad_branch(self):
        with _test_eager_guard():
            self.func_case2_prune_no_grad_branch()
        self.func_case2_prune_no_grad_branch()

    def func_case3_prune_no_grad_branch2(self):
        """Linear -> cast to int64 -> one_hot -> mean: the weight gets no grad."""
        with fluid.dygraph.guard():
            value1 = np.arange(1).reshape(1, 1)
            linear = fluid.dygraph.Linear(1, 1, act=None)
            label = fluid.dygraph.to_variable(value1).astype("float32")
            label = linear(label)
            label = fluid.layers.cast(label, dtype="float32")
            label = fluid.layers.cast(label, dtype='int64')
            out = fluid.layers.one_hot(input=label, depth=100)
            loss = fluid.layers.mean(out)
            loss.backward()
            self.assertIsNone(linear.weight._grad_ivar())

    def test_case3_prune_no_grad_branch2(self):
        with _test_eager_guard():
            self.func_case3_prune_no_grad_branch2()
        self.func_case3_prune_no_grad_branch2()

    def func_case4_with_no_grad_op_maker(self):
        """The output of gaussian_random has no gradient after backward."""
        with fluid.dygraph.guard():
            out = fluid.layers.gaussian_random(shape=[20, 30])
            loss = fluid.layers.mean(out)
            loss.backward()
            self.assertIsNone(out._grad_ivar())

    def test_case4_with_no_grad_op_maker(self):
        with _test_eager_guard():
            self.func_case4_with_no_grad_op_maker()
        self.func_case4_with_no_grad_op_maker()

484 485 486

# Run the test cases when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()