# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.nn import Embedding
from paddle.tensor import random


class AutoPruneLayer0(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear1 = paddle.nn.Linear(
            input_size,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )
        self.linear2 = paddle.nn.Linear(
            5,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )

    def forward(self, x, y):
        a = self.linear1(x)
        b = self.linear2(y)
        c = fluid.layers.mul(a, b)
        d = paddle.mean(c)
        return d


class AutoPruneLayer1(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear1 = paddle.nn.Linear(
            input_size,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )
        self.linear2 = paddle.nn.Linear(
            5,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )

    def forward(self, x, y):
        a = self.linear1(x)
        b = self.linear2(y)
        # b is cut off from the backward graph, so linear2 should be pruned.
        b.stop_gradient = True
        c = fluid.layers.mul(a, b)
        d = paddle.mean(c)
        return d


class AutoPruneLayer2(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear = paddle.nn.Linear(input_size, 10)
        self.linear2 = paddle.nn.Linear(1, 1)

    def forward(self, x, label):
        feature = self.linear(x)
        label = self.linear2(label)
        # Casting to int64 cuts the label branch off from the backward graph.
        label = fluid.layers.cast(label, dtype="float32")
        label = fluid.layers.cast(label, dtype='int64')
        # Note that the label is not persistable in paddle.nn.functional.cross_entropy.
        loss = paddle.nn.functional.cross_entropy(
            input=feature, label=label, reduction='none', use_softmax=False
        )
        loss = paddle.mean(loss)
        return loss


class AutoPruneLayer3(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear = paddle.nn.Linear(input_size, 20)

    def forward(self, x, label, test_num):
        feature = self.linear(x)
        part1, part2 = fluid.layers.split(
            feature, num_or_sections=[10, 10], dim=1
        )
        # Note: part2 does not contribute to the loss.
        loss = paddle.nn.functional.cross_entropy(
            input=part1, label=label, reduction='none', use_softmax=False
        )
        loss = paddle.mean(loss)
        if test_num == 1:
            return loss, part2
        else:
            return loss, part1, part2


class MyLayer(fluid.Layer):
    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        self.embed0 = Embedding(vocab_size, size)
        self.embed1 = Embedding(vocab_size, size)
        self.linear_0 = paddle.nn.Linear(input_size, size)
        self.linear_1 = paddle.nn.Linear(input_size, size)

    def forward(self, x):
        # this method involves only the linear layers
        loss = paddle.mean(self.linear_0(x) + self.linear_1(x))
        return loss

    def linear0(self, x):
        loss = paddle.mean(self.linear_0(x))
        return loss

    def embed_linear0(self, x):
        loss = paddle.mean(self.linear_0(self.embed0(x)))
        return loss


class MyLayer2(fluid.Layer):
    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        self.embed0 = Embedding(vocab_size, size)
        self.embed1 = Embedding(vocab_size, size)
        self.linear_0 = paddle.nn.Linear(input_size, size)
        self.linear_1 = paddle.nn.Linear(input_size, size)

    def forward(self, indices):
        # Mind the difference with MyLayer:
        # in this example, the forward method involves all parameters.
        loss = paddle.mean(
            self.linear_0(self.embed0(indices))
            + self.linear_1(self.embed1(indices))
        )
        return loss

    def linear0(self, x):
        loss = paddle.mean(self.linear_0(x))
        return loss

    def embed_linear0(self, x):
        loss = paddle.mean(self.linear_0(self.embed0(x)))
        return loss


class TestImperativeAutoPrune(unittest.TestCase):
    def test_auto_prune(self):
        with fluid.dygraph.guard():
            case1 = AutoPruneLayer0(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case1(v1, v2)
            loss.backward()
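            # Both inputs feed the loss, so both linear1 and linear2 should receive gradients.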
            self.assertIsNotNone(case1.linear2.weight._grad_ivar())
            self.assertIsNotNone(case1.linear1.weight._grad_ivar())

    def test_auto_prune2(self):
        with fluid.dygraph.guard():
            case2 = AutoPruneLayer1(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case2(v1, v2)
            loss.backward()
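            # b.stop_gradient was set inside AutoPruneLayer1, so linear2 is pruned from the backward pass.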
            self.assertIsNone(case2.linear2.weight._grad_ivar())
            self.assertIsNotNone(case2.linear1.weight._grad_ivar())

    # TODO(jiabin): Support this when we support better split tensor
    def test_auto_prune3(self):
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        with fluid.dygraph.guard():
            case3 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part2 = case3(v1, v2, 1)
            loss.backward()
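            # part2 never contributes to the loss, so its retained gradient should be all zeros.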
            self.assertIsNotNone(case3.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

    def test_auto_prune4(self):
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        with fluid.dygraph.guard():
            case4 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part2 = case4(v1, v2, 1)
            part2.backward()
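            # Backward from part2 still reaches linear, and part2's own gradient is all ones.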
            self.assertIsNotNone(case4.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 1).all())
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

    def test_auto_prune5(self):
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        with fluid.dygraph.guard():
            case4 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part1, part2 = case4(v1, v2, 2)
            part1.backward()
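            # part1 drives the backward pass; part2 is unrelated, so its gradient stays zero.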
            self.assertIsNotNone(case4.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

    def test_auto_prune6(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(3, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            out.backward()
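            # out1 was cut from the graph before concat, so linear should receive no gradient.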
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def test_auto_prune7(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(3, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            out.backward()
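            # Same expectation as test_auto_prune6: the stopped branch yields no gradients.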
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def test_auto_prune8(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(5, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            linear2.weight.stop_gradient = True
            out2.backward()
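            # linear2.weight is frozen, so the optimizer must leave it unchanged while linear.weight is updated.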
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)
            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            self.assertFalse(
                np.array_equal(linear_origin, linear.weight.numpy())
            )

    def test_auto_prune9(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(5, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            out2.stop_gradient = True
            out2.backward()
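            # The graph is cut at out2, so neither parameter should be updated below.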
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)
            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            np.testing.assert_array_equal(linear_origin, linear.weight.numpy())
            # Accessing the gradient of a parameter that received none may raise ValueError.
            try:
                linear2.weight.gradient()
            except ValueError as e:
                assert type(e) == ValueError

    def test_auto_prune10(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(3, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            # TODO(jiabin): In Eager Mode we don't actually need sort_sum_gradient, this test should be removed when we don't support fluid anymore.
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
            out.backward()
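            # Same pruning expectation as test_auto_prune6, here with sorted gradient summation enabled.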
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

    def test_auto_prune_with_optimizer(self):
        vocab_size = 100
        size = 20
        batch_size = 16

        indices = np.random.randint(
            low=0, high=100, size=(batch_size, 1)
        ).astype("int64")
        embed = np.random.randn(batch_size, size).astype("float32")

        place = fluid.CPUPlace()
        with fluid.dygraph.guard(place):
            model = MyLayer(size, vocab_size, size)
            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip
            )
            indices = fluid.dygraph.to_variable(indices)
            embed = fluid.dygraph.to_variable(embed)
            dummy_loss = model(embed)

            loss = model.embed_linear0(indices)
            loss.backward()
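            # Only embed0 and linear_0 are involved in this loss, so embed1 and linear_1 must be pruned.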
            _, params_grads = optimizer.minimize(loss)
            for items in params_grads:
                assert items[0].name != model.embed1.weight.name
                assert items[0].name != model.linear_1.weight.name
            assert model.embed1.weight._grad_ivar() is None
            assert model.linear_1.weight._grad_ivar() is None

        with fluid.dygraph.guard(place):
            model = MyLayer2(size, vocab_size, size)
            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip
            )

            indices = fluid.dygraph.to_variable(indices)
            embed = fluid.dygraph.to_variable(embed)
            dummy_loss = model(indices)

            loss = model.embed_linear0(indices)
            loss.backward()
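            # Repeat the pruning check for MyLayer2 with the gradients from this run.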
            _, params_grads = optimizer.minimize(loss)
            for items in params_grads:
                assert items[0].name != model.embed1.weight.name
                assert items[0].name != model.linear_1.weight.name
            assert model.embed1.weight._grad_ivar() is None
            assert model.linear_1.weight._grad_ivar() is None

    def test_case2_prune_no_grad_branch(self):
        with fluid.dygraph.guard():
            value1 = np.arange(784).reshape(1, 784)
            value2 = np.arange(1).reshape(1, 1)
            v1 = fluid.dygraph.to_variable(value1).astype("float32")
            v2 = fluid.dygraph.to_variable(value2).astype("float32")
            case3 = AutoPruneLayer2(input_size=784)
            loss = case3(v1, v2)
            loss.backward()
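            # The label branch through linear2 is cut by the int64 cast, so only linear gets a gradient.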
            self.assertIsNone(case3.linear2.weight._grad_ivar())
            self.assertIsNotNone(case3.linear.weight._grad_ivar())

    def test_case3_prune_no_grad_branch2(self):
        with fluid.dygraph.guard():
            value1 = np.arange(1).reshape(1, 1)
            linear = paddle.nn.Linear(1, 1)
            label = fluid.dygraph.to_variable(value1).astype("float32")
            label = linear(label)
            label = fluid.layers.cast(label, dtype="float32")
            label = fluid.layers.cast(label, dtype='int64')
            out = fluid.layers.one_hot(input=label, depth=100)
            loss = paddle.mean(out)
            loss.backward()
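            # The int64 cast and one_hot break the backward path, so linear receives no gradient.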
            self.assertIsNone(linear.weight._grad_ivar())

    def test_case4_with_no_grad_op_maker(self):
        with fluid.dygraph.guard():
            out = random.gaussian(shape=[20, 30])
            loss = paddle.mean(out)
            loss.backward()
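            # The gaussian random op has no grad op maker, so out receives no gradient.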
            self.assertIsNone(out._grad_ivar())


if __name__ == '__main__':
    unittest.main()