# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.nn import Embedding
from paddle.tensor import random


class AutoPruneLayer0(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear1 = paddle.nn.Linear(
            input_size,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )
        self.linear2 = paddle.nn.Linear(
            5,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )

    def forward(self, x, y):
        a = self.linear1(x)
        b = self.linear2(y)
        c = paddle.matmul(a, b)
        d = paddle.mean(c)
        return d


class AutoPruneLayer1(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear1 = paddle.nn.Linear(
            input_size,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )
        self.linear2 = paddle.nn.Linear(
            5,
            5,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=2)
            ),
            bias_attr=False,
        )

    def forward(self, x, y):
        a = self.linear1(x)
        b = self.linear2(y)
        b.stop_gradient = True
        c = paddle.matmul(a, b)
        d = paddle.mean(c)
        return d


class AutoPruneLayer2(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear = paddle.nn.Linear(input_size, 10)
        self.linear2 = paddle.nn.Linear(1, 1)

    def forward(self, x, label):
        feature = self.linear(x)
        label = self.linear2(label)
        label = fluid.layers.cast(label, dtype="float32")
        label = fluid.layers.cast(label, dtype='int64')
        # Note that the label is not persistable in paddle.nn.functional.cross_entropy.
        loss = paddle.nn.functional.cross_entropy(
            input=feature, label=label, reduction='none', use_softmax=False
        )
        loss = paddle.mean(loss)
        return loss


class AutoPruneLayer3(fluid.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear = paddle.nn.Linear(input_size, 20)

    def forward(self, x, label, test_num):
        feature = self.linear(x)
        part1, part2 = paddle.split(feature, num_or_sections=[10, 10], axis=1)
        # Note: part2 is not used in the loss.
        loss = paddle.nn.functional.cross_entropy(
            input=part1, label=label, reduction='none', use_softmax=False
        )
        loss = paddle.mean(loss)
        if test_num == 1:
            return loss, part2
        else:
            return loss, part1, part2


class MyLayer(fluid.Layer):
    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        self.embed0 = Embedding(vocab_size, size)
        self.embed1 = Embedding(vocab_size, size)
        self.linear_0 = paddle.nn.Linear(input_size, size)
        self.linear_1 = paddle.nn.Linear(input_size, size)

    def forward(self, x):
        # this method involves only the linear layers
        loss = paddle.mean(self.linear_0(x) + self.linear_1(x))
        return loss

    def linear0(self, x):
        loss = paddle.mean(self.linear_0(x))
        return loss

    def embed_linear0(self, x):
        loss = paddle.mean(self.linear_0(self.embed0(x)))
        return loss


class MyLayer2(fluid.Layer):
    def __init__(self, input_size, vocab_size, size, dtype="float32"):
        super().__init__(dtype=dtype)
        self.embed0 = Embedding(vocab_size, size)
        self.embed1 = Embedding(vocab_size, size)
        self.linear_0 = paddle.nn.Linear(input_size, size)
        self.linear_1 = paddle.nn.Linear(input_size, size)

    def forward(self, indices):
        # mind the difference with MyLayer
        # In this example, the forward method involves all params
        loss = paddle.mean(
            self.linear_0(self.embed0(indices))
            + self.linear_1(self.embed1(indices))
        )
        return loss

    def linear0(self, x):
        loss = paddle.mean(self.linear_0(x))
        return loss

    def embed_linear0(self, x):
        loss = paddle.mean(self.linear_0(self.embed0(x)))
        return loss


class TestImperativeAutoPrune(unittest.TestCase):
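    # Both inputs reach the loss through matmul, so both linear1 and linear2
    # should receive gradients.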
    def test_auto_prune(self):
        with fluid.dygraph.guard():
            case1 = AutoPruneLayer0(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case1(v1, v2)
            loss.backward()
            self.assertIsNotNone(case1.linear2.weight._grad_ivar())
            self.assertIsNotNone(case1.linear1.weight._grad_ivar())

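    # AutoPruneLayer1 marks the linear2 branch as stop_gradient, so only
    # linear1's weight should get a gradient.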
    def test_auto_prune2(self):
        with fluid.dygraph.guard():
            case2 = AutoPruneLayer1(input_size=5)
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case2(v1, v2)

            loss.backward()
            self.assertIsNone(case2.linear2.weight._grad_ivar())
            self.assertIsNotNone(case2.linear1.weight._grad_ivar())

    # TODO(jiabin): Support this when we support better split tensor
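    # Only part1 feeds the loss; part2 is retained but its gradient should be
    # all zeros.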
    def test_auto_prune3(self):
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        with fluid.dygraph.guard():
            case3 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part2 = case3(v1, v2, 1)
            loss.backward()
            self.assertIsNotNone(case3.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

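    # Backward starts from part2 itself, so its gradient is all ones and the
    # shared linear weight still gets a gradient.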
    def test_auto_prune4(self):
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        with fluid.dygraph.guard():
            case4 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part2 = case4(v1, v2, 1)
            part2.backward()
            self.assertIsNotNone(case4.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 1).all())
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

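    # Backward starts from part1; part2 takes no part in it, so its gradient
    # stays all zeros.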
    def test_auto_prune5(self):
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        with fluid.dygraph.guard():
            case4 = AutoPruneLayer3(input_size=784)
            value1 = np.arange(784).reshape(1, 784).astype("float32")
            value2 = np.arange(1).reshape(1, 1).astype("int64")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss, part1, part2 = case4(v1, v2, 2)
            part1.backward()
            self.assertIsNotNone(case4.linear.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

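    # out1 is stop_gradient before the concat, so neither out1 nor linear's
    # weight should receive a gradient.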
    def test_auto_prune6(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(3, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            out.backward()
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

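    # Same setup as test_auto_prune6: the stop_gradient output keeps linear's
    # weight out of the backward pass.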
    def test_auto_prune7(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(3, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            out.backward()
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

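    # linear2's weight is frozen before backward, so SGD should leave it
    # unchanged while linear's weight is updated.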
    def test_auto_prune8(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(5, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            linear2.weight.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)
            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            self.assertFalse(
                np.array_equal(linear_origin, linear.weight.numpy())
            )

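    # out2 is stop_gradient before backward, so no gradient flows and neither
    # weight should change.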
    def test_auto_prune9(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(5, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            linear_origin = linear.weight.numpy()
            out2 = linear2(out1)
            linear2_origin = linear2.weight.numpy()
            out2.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(linear.parameters() + linear2.parameters()),
            )
            optimizer.minimize(out2)
            np.testing.assert_array_equal(
                linear2_origin, linear2.weight.numpy()
            )
            np.testing.assert_array_equal(linear_origin, linear.weight.numpy())
            try:
                linear2.weight.gradient()
            except ValueError as e:
                assert type(e) == ValueError

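    # Same pruning as test_auto_prune6, but with FLAGS_sort_sum_gradient
    # enabled before backward.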
    def test_auto_prune10(self):
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            value1 = np.arange(6).reshape(2, 3).astype("float32")
            value2 = np.arange(10).reshape(2, 5).astype("float32")
            linear = paddle.nn.Linear(13, 5)
            linear2 = paddle.nn.Linear(3, 3)
            a = fluid.dygraph.to_variable(value0)
            b = fluid.dygraph.to_variable(value1)
            c = fluid.dygraph.to_variable(value2)
            out1 = linear(a)
            out2 = linear2(b)
            out1.stop_gradient = True
            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
            # TODO(jiabin): In Eager Mode we don't actually need sort_sum_gradient, this test should be removed when we don't support fluid anymore.
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
            out.backward()
            self.assertIsNone(linear.weight.gradient())
            self.assertIsNone(out1.gradient())

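    # Only the parameters actually used by the loss (embed0 and linear_0)
    # should show up in params_grads or receive gradients.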
    def test_auto_prune_with_optimizer(self):
        vocab_size = 100
        size = 20
        batch_size = 16

        indices = np.random.randint(
            low=0, high=100, size=(batch_size, 1)
        ).astype("int64")
        embed = np.random.randn(batch_size, size).astype("float32")

        place = fluid.CPUPlace()
        with fluid.dygraph.guard(place):
            model = MyLayer(size, vocab_size, size)
            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip
            )
            indices = fluid.dygraph.to_variable(indices)
            embed = fluid.dygraph.to_variable(embed)
            dummy_loss = model(embed)

            loss = model.embed_linear0(indices)
            loss.backward()
            _, params_grads = optimizer.minimize(loss)
            for items in params_grads:
                assert items[0].name != model.embed1.weight.name
                assert items[0].name != model.linear_1.weight.name
            assert model.embed1.weight._grad_ivar() is None
            assert model.linear_1.weight._grad_ivar() is None

        with fluid.dygraph.guard(place):
            model = MyLayer2(size, vocab_size, size)
            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip
            )

            indices = fluid.dygraph.to_variable(indices)
            embed = fluid.dygraph.to_variable(embed)
            dummy_loss = model(indices)

            loss = model.embed_linear0(indices)
            loss.backward()
            _, params_grads = optimizer.minimize(loss)
            for items in params_grads:
                assert items[0].name != model.embed1.weight.name
                assert items[0].name != model.linear_1.weight.name
            assert model.embed1.weight._grad_ivar() is None
            assert model.linear_1.weight._grad_ivar() is None

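    # The label branch runs through non-differentiable casts, so linear2's
    # weight should get no gradient while linear's weight does.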
    def test_case2_prune_no_grad_branch(self):
        with fluid.dygraph.guard():
            value1 = np.arange(784).reshape(1, 784)
            value2 = np.arange(1).reshape(1, 1)
            v1 = fluid.dygraph.to_variable(value1).astype("float32")
            v2 = fluid.dygraph.to_variable(value2).astype("float32")
            case3 = AutoPruneLayer2(input_size=784)
            loss = case3(v1, v2)
            loss.backward()
            self.assertIsNone(case3.linear2.weight._grad_ivar())
            self.assertIsNotNone(case3.linear.weight._grad_ivar())

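    # one_hot of the int64-casted label gives no gradient path back to the
    # linear layer, so its weight should stay without a gradient.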
    def test_case3_prune_no_grad_branch2(self):
        with fluid.dygraph.guard():
            value1 = np.arange(1).reshape(1, 1)
            linear = paddle.nn.Linear(1, 1)
            label = fluid.dygraph.to_variable(value1).astype("float32")
            label = linear(label)
            label = fluid.layers.cast(label, dtype="float32")
            label = fluid.layers.cast(label, dtype='int64')
            out = paddle.nn.functional.one_hot(label, 100)
            loss = paddle.mean(out)
            loss.backward()
            self.assertIsNone(linear.weight._grad_ivar())

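    # The gaussian random op has no grad op maker, so backward leaves out
    # without a gradient.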
    def test_case4_with_no_grad_op_maker(self):
        with fluid.dygraph.guard():
            out = random.gaussian(shape=[20, 30])
            loss = paddle.mean(out)
            loss.backward()
            self.assertIsNone(out._grad_ivar())


if __name__ == '__main__':
    unittest.main()