# test_imperative_auto_prune.py
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import paddle.fluid as fluid
import numpy as np


class AutoPruneLayer0(fluid.Layer):
    """Two bias-free FC branches whose outputs are multiplied and averaged.

    Neither branch is detached in ``forward``, so both ``fc1`` and ``fc2``
    stay on the path from the inputs to the returned scalar.
    """

    def __init__(self, name_scope):
        super(AutoPruneLayer0, self).__init__(name_scope)
        # Each branch has 5 output units, no bias, and weights filled with 2.
        first_init = fluid.initializer.ConstantInitializer(value=2)
        self.fc1 = fluid.dygraph.FC(
            "FC_1", size=5, param_attr=first_init, bias_attr=False)
        second_init = fluid.initializer.ConstantInitializer(value=2)
        self.fc2 = fluid.dygraph.FC(
            "FC_2", size=5, param_attr=second_init, bias_attr=False)

    def forward(self, x, y):
        """Return ``reduce_mean(mul(fc1(x), fc2(y)))``."""
        left = self.fc1(x)
        right = self.fc2(y)
        return fluid.layers.reduce_mean(fluid.layers.mul(left, right))


class AutoPruneLayer1(fluid.Layer):
    """Same topology as two multiplied FC branches, with the second detached.

    ``forward`` sets ``stop_gradient`` on the output of ``fc2`` before it is
    multiplied with the output of ``fc1``.
    """

    def __init__(self, name_scope):
        super(AutoPruneLayer1, self).__init__(name_scope)
        # Each branch has 5 output units, no bias, and weights filled with 2.
        first_init = fluid.initializer.ConstantInitializer(value=2)
        self.fc1 = fluid.dygraph.FC(
            "FC_1", size=5, param_attr=first_init, bias_attr=False)
        second_init = fluid.initializer.ConstantInitializer(value=2)
        self.fc2 = fluid.dygraph.FC(
            "FC_2", size=5, param_attr=second_init, bias_attr=False)

    def forward(self, x, y):
        """Return ``reduce_mean(mul(fc1(x), detached fc2(y)))``."""
        left = self.fc1(x)
        right = self.fc2(y)
        # Detach the second branch before it joins the product.
        right.stop_gradient = True
        product = fluid.layers.mul(left, right)
        return fluid.layers.reduce_mean(product)


class AutoPruneLayer2(fluid.Layer):
    """Cross-entropy model whose label is produced by a second FC layer.

    The label goes through ``fc2`` and is then cast to ``int64`` before it is
    handed to ``cross_entropy``; the tests in this file expect ``fc2`` to end
    up without a gradient.
    """

    def __init__(self, name_scope):
        super(AutoPruneLayer2, self).__init__(name_scope)
        self.fc = fluid.dygraph.FC("FC1", size=10, act=None)
        self.fc2 = fluid.dygraph.FC("FC2", size=1, act=None)

    def forward(self, x, label):
        """Return the mean cross-entropy of ``fc(x)`` against the cast label."""
        feature = self.fc(x)
        projected = self.fc2(label)
        as_float = fluid.layers.cast(projected, dtype="float32")
        int_label = fluid.layers.cast(as_float, dtype='int64')
        # The label only enters the loss as the `label` argument of
        # cross_entropy, after the integer cast above.
        per_sample = fluid.layers.cross_entropy(input=feature, label=int_label)
        return fluid.layers.mean(per_sample)


class AutoPruneLayer3(fluid.Layer):
    """FC layer whose 20-wide output is split in two; only one half feeds the loss."""

    def __init__(self, name_scope):
        super(AutoPruneLayer3, self).__init__(name_scope)
        self.fc = fluid.dygraph.FC("FC1", size=20, act=None)

    def forward(self, x, label, test_num):
        """Compute the loss on the first half and expose the split halves.

        Returns ``(loss, part2)`` when ``test_num == 1`` and
        ``(loss, part1, part2)`` otherwise.
        """
        halves = fluid.layers.split(
            self.fc(x), num_or_sections=[10, 10], dim=1)
        part1, part2 = halves
        # part2 never contributes to the loss.
        loss = fluid.layers.mean(
            fluid.layers.cross_entropy(input=part1, label=label))
        if test_num != 1:
            return loss, part1, part2
        return loss, part2


class MyLayer(fluid.Layer):
    """Layer with two embeddings and two FC layers used to test pruning.

    ``forward`` touches only ``fc0`` and ``fc1``; ``embed_linear0`` touches
    only ``embed0`` and ``fc0``. ``embed1`` is not used by any method here.
    """

    def __init__(self, name_scope, vocab_size, size, dtype="float32"):
        super(MyLayer, self).__init__(name_scope, dtype)
        self.embed0 = fluid.Embedding(size=(vocab_size, size))
        self.embed1 = fluid.Embedding(size=(vocab_size, size))
        self.fc0 = fluid.FC(self.full_name(), size=size, dtype=dtype)
        self.fc1 = fluid.FC(self.full_name(), size=size, dtype=dtype)

    def forward(self, x):
        """Return ``reduce_mean(fc0(x) + fc1(x))``."""
        # this method involves only the fc layers
        loss = fluid.layers.reduce_mean(self.fc0(x) + self.fc1(x))
        return loss

    def linear0(self, x):
        """Return ``reduce_mean(fc0(x))``."""
        loss = fluid.layers.reduce_mean(self.fc0(x))
        return loss

    def embed_linear0(self, x):
        """Return ``reduce_mean(fc0(embed0(x)))``."""
        loss = fluid.layers.reduce_mean(self.fc0(self.embed0(x)))
        return loss


class MyLayer2(fluid.Layer):
    """Variant of the embedding/FC test layer whose ``forward`` uses every parameter.

    ``forward`` runs both embeddings and both FC layers, while
    ``embed_linear0`` touches only ``embed0`` and ``fc0``.
    """

    def __init__(self, name_scope, vocab_size, size, dtype="float32"):
        super(MyLayer2, self).__init__(name_scope, dtype)
        self.embed0 = fluid.Embedding(size=(vocab_size, size))
        self.embed1 = fluid.Embedding(size=(vocab_size, size))
        self.fc0 = fluid.FC(self.full_name(), size=size, dtype=dtype)
        self.fc1 = fluid.FC(self.full_name(), size=size, dtype=dtype)

    def forward(self, indices):
        """Return ``reduce_mean(fc0(embed0(indices)) + fc1(embed1(indices)))``."""
        # Mind the difference with MyLayer:
        # in this example, the forward method involves all params.
        loss = fluid.layers.reduce_mean(
            self.fc0(self.embed0(indices)) + self.fc1(self.embed1(indices)))
        return loss

    def linear0(self, x):
        """Return ``reduce_mean(fc0(x))``."""
        loss = fluid.layers.reduce_mean(self.fc0(x))
        return loss

    def embed_linear0(self, x):
        """Return ``reduce_mean(fc0(embed0(x)))``."""
        loss = fluid.layers.reduce_mean(self.fc0(self.embed0(x)))
        return loss


class TestImperativeAutoPrune(unittest.TestCase):
    """Checks which variables and parameters hold gradients after backward().

    Every test builds a small dygraph model, runs ``backward()`` (and in some
    cases an optimizer step) and asserts on the gradients or parameter values
    that result.
    """

    # ------------------------------------------------------------------ #
    # Private helpers (not collected by unittest: names lack "test").     #
    # ------------------------------------------------------------------ #

    def _forward_layer3(self, test_num):
        """Build an AutoPruneLayer3 and run it once on fixed inputs.

        Must be called inside ``fluid.dygraph.guard()``.

        Returns:
            tuple: ``(layer, outputs)`` where ``outputs`` is whatever the
            layer's forward returns for the given ``test_num``.
        """
        layer = AutoPruneLayer3("l3")
        value1 = np.arange(784).reshape(1, 784).astype("float32")
        value2 = np.arange(1).reshape(1, 1).astype("int64")
        v1 = fluid.dygraph.to_variable(value1)
        v2 = fluid.dygraph.to_variable(value2)
        return layer, layer(v1, v2, test_num)

    def _build_concat_case(self):
        """Concatenate a detached FC output, a live FC output and a raw input.

        Must be called inside ``fluid.dygraph.guard()``.

        Returns:
            tuple: ``(fc, out1, out)`` - the FC layer whose output was
            detached, that detached output, and the concatenated result.
        """
        value0 = np.arange(26).reshape(2, 13).astype("float32")
        value1 = np.arange(6).reshape(2, 3).astype("float32")
        value2 = np.arange(10).reshape(2, 5).astype("float32")
        fc = fluid.FC("fc1", size=5, dtype="float32")
        fc2 = fluid.FC("fc2", size=3, dtype="float32")
        a = fluid.dygraph.to_variable(value0)
        b = fluid.dygraph.to_variable(value1)
        c = fluid.dygraph.to_variable(value2)
        out1 = fc(a)
        out2 = fc2(b)
        out1.stop_gradient = True
        out = fluid.layers.concat(input=[out1, out2, c], axis=1)
        return fc, out1, out

    def _check_prune_no_grad_branch(self):
        """Run AutoPruneLayer2 and check only the feature branch gets a gradient."""
        with fluid.dygraph.guard():
            value1 = np.arange(784).reshape(1, 784)
            value2 = np.arange(1).reshape(1, 1)
            v1 = fluid.dygraph.to_variable(value1).astype("float32")
            v2 = fluid.dygraph.to_variable(value2).astype("float32")
            case3 = AutoPruneLayer2("l2")
            loss = case3(v1, v2)
            loss.backward()
            self.assertIsNone(case3.fc2.weight._grad_ivar())
            self.assertIsNotNone(case3.fc.weight._grad_ivar())

    def _assert_unused_branch_pruned(self, model, params_grads):
        """Assert ``embed1``/``fc1`` of ``model`` were left out of the update.

        Args:
            model: layer exposing ``embed1`` and ``fc1`` sub-layers.
            params_grads: second value returned by ``optimizer.minimize``;
                the first element of each item is read as the parameter.
        """
        pruned_names = (model.embed1.weight.name, model.fc1.weight.name)
        for item in params_grads:
            # Compare names by value; an identity check on strings would
            # pass regardless of their contents.
            self.assertNotIn(item[0].name, pruned_names)
        self.assertIsNone(model.embed1.weight._grad_ivar())
        self.assertIsNone(model.fc1.weight._grad_ivar())

    # ------------------------------------------------------------------ #
    # Tests                                                               #
    # ------------------------------------------------------------------ #

    def test_auto_prune(self):
        """Both FC weights get a gradient when no branch is detached."""
        with fluid.dygraph.guard():
            case1 = AutoPruneLayer0("l1")
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case1(v1, v2)
            loss.backward()
            self.assertIsNotNone(case1.fc2.weight._grad_ivar())
            self.assertIsNotNone(case1.fc1.weight._grad_ivar())

    def test_auto_prune2(self):
        """The detached fc2 branch gets no gradient; fc1 still does."""
        with fluid.dygraph.guard():
            case2 = AutoPruneLayer1("l1")
            value1 = np.arange(25).reshape(5, 5).astype("float32")
            value2 = np.arange(25).reshape(5, 5).astype("float32")
            v1 = fluid.dygraph.to_variable(value1)
            v2 = fluid.dygraph.to_variable(value2)
            loss = case2(v1, v2)
            loss.backward()
            self.assertIsNone(case2.fc2.weight._grad_ivar())
            self.assertIsNotNone(case2.fc1.weight._grad_ivar())

    def test_auto_prune3(self):
        """Backward from the loss leaves the unused split half at zero gradient."""
        with fluid.dygraph.guard():
            case3, (loss, part2) = self._forward_layer3(1)
            loss.backward()
            self.assertIsNotNone(case3.fc.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())

    def test_auto_prune4(self):
        """Backward from part2 itself gives part2 an all-ones gradient."""
        with fluid.dygraph.guard():
            case4, (_, part2) = self._forward_layer3(1)
            part2.backward()
            self.assertIsNotNone(case4.fc.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 1).all())

    def test_auto_prune5(self):
        """Backward from part1 leaves part2 at zero gradient."""
        with fluid.dygraph.guard():
            case4, (_, part1, part2) = self._forward_layer3(2)
            part1.backward()
            self.assertIsNotNone(case4.fc.weight._grad_ivar())
            self.assertTrue((part2.gradient() == 0).all())

    def test_auto_prune6(self):
        """Detached concat input: default backward yields zero gradients."""
        with fluid.dygraph.guard():
            fc, out1, out = self._build_concat_case()
            out.backward()
            self.assertTrue((fc.weight.gradient() == 0).all())
            self.assertTrue((out1.gradient() == 0).all())

    def test_auto_prune7(self):
        """Same as test_auto_prune6 with an explicit default BackwardStrategy."""
        with fluid.dygraph.guard():
            fc, out1, out = self._build_concat_case()
            backward_strategy = fluid.dygraph.BackwardStrategy()
            out.backward(backward_strategy)
            self.assertTrue((fc.weight.gradient() == 0).all())
            self.assertTrue((out1.gradient() == 0).all())

    def test_auto_prune8(self):
        """A weight flagged stop_gradient is not changed by the SGD step."""
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            fc = fluid.FC("fc1", size=5, dtype="float32")
            fc2 = fluid.FC("fc2", size=3, dtype="float32")
            a = fluid.dygraph.to_variable(value0)
            out1 = fc(a)
            fc_origin = fc.weight.numpy()
            out2 = fc2(out1)
            fc2_origin = fc2.weight.numpy()
            # Freeze only the second layer's weight before backward.
            fc2.weight.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(fc.parameters() + fc2.parameters()))
            optimizer.minimize(out2)
            self.assertTrue(np.array_equal(fc2_origin, fc2.weight.numpy()))
            self.assertFalse(np.array_equal(fc_origin, fc.weight.numpy()))

    def test_auto_prune9(self):
        """Detaching the final output leaves every weight unchanged."""
        with fluid.dygraph.guard():
            value0 = np.arange(26).reshape(2, 13).astype("float32")
            fc = fluid.FC("fc1", size=5, dtype="float32")
            fc2 = fluid.FC("fc2", size=3, dtype="float32")
            a = fluid.dygraph.to_variable(value0)
            out1 = fc(a)
            fc_origin = fc.weight.numpy()
            out2 = fc2(out1)
            fc2_origin = fc2.weight.numpy()
            out2.stop_gradient = True
            out2.backward()
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.003,
                parameter_list=(fc.parameters() + fc2.parameters()))
            optimizer.minimize(out2)
            self.assertTrue(np.array_equal(fc2_origin, fc2.weight.numpy()))
            self.assertTrue(np.array_equal(fc_origin, fc.weight.numpy()))
            # Best-effort probe: a ValueError from gradient() is tolerated.
            # NOTE(review): if the raise is required behaviour, switch this
            # to assertRaises(ValueError) - confirm against the framework.
            try:
                fc2.weight.gradient()
            except ValueError as e:
                self.assertIsInstance(e, ValueError)

    def test_auto_prune10(self):
        """Same as test_auto_prune6 with sort_sum_gradient enabled."""
        with fluid.dygraph.guard():
            fc, out1, out = self._build_concat_case()
            backward_strategy = fluid.dygraph.BackwardStrategy()
            backward_strategy.sort_sum_gradient = True
            out.backward(backward_strategy)
            self.assertTrue((fc.weight.gradient() == 0).all())
            self.assertTrue((out1.gradient() == 0).all())

    def test_auto_prune_with_optimizer(self):
        """Parameters outside the loss path are skipped by minimize().

        Runs the same scenario on MyLayer and MyLayer2: a throw-away forward
        pass, then a loss built from ``embed0``/``fc0`` only. Each model is
        checked against the ``params_grads`` returned by its own optimizer.
        """
        vocab_size = 100
        size = 20
        batch_size = 16

        indices = np.random.randint(
            low=0, high=vocab_size, size=(batch_size, 1)).astype("int64")
        embed = np.random.randn(batch_size, size).astype("float32")

        place = fluid.CPUPlace()
        with fluid.dygraph.guard(place):
            model = MyLayer("mylayer", vocab_size, size)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters())
            grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)

            # Fresh variables per guard; the numpy sources stay untouched.
            indices_var = fluid.dygraph.to_variable(indices)
            embed_var = fluid.dygraph.to_variable(embed)
            # Throw-away forward pass through fc0 and fc1; result is unused.
            model(embed_var)

            loss = model.embed_linear0(indices_var)
            loss.backward()
            _, params_grads = optimizer.minimize(loss, grad_clip=grad_clip)
            self._assert_unused_branch_pruned(model, params_grads)

        with fluid.dygraph.guard(place):
            model = MyLayer2("mylayer", vocab_size, size)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters())
            grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)

            indices_var = fluid.dygraph.to_variable(indices)
            # Throw-away forward pass that touches every parameter.
            model(indices_var)

            loss = model.embed_linear0(indices_var)
            loss.backward()
            # Capture this model's own params_grads rather than reusing the
            # list produced for the first model.
            _, params_grads = optimizer.minimize(loss, grad_clip=grad_clip)
            self._assert_unused_branch_pruned(model, params_grads)

    def test_case1_prune_no_grad_branch(self):
        """The label branch (fc2) of AutoPruneLayer2 gets no gradient."""
        self._check_prune_no_grad_branch()

    def test_case2_prune_no_grad_branch(self):
        """Second run of the AutoPruneLayer2 no-gradient-branch check."""
        self._check_prune_no_grad_branch()

    def test_case3_prune_no_grad_branch2(self):
        """An FC feeding only an int64 cast and one_hot gets no gradient."""
        with fluid.dygraph.guard():
            value1 = np.arange(1).reshape(1, 1)
            fc = fluid.dygraph.FC("FC1", size=1, act=None)
            label = fluid.dygraph.to_variable(value1).astype("float32")
            label = fc(label)
            label = fluid.layers.cast(label, dtype="float32")
            label = fluid.layers.cast(label, dtype='int64')
            out = fluid.layers.one_hot(input=label, depth=100)
            loss = fluid.layers.mean(out)
            loss.backward()
            self.assertIsNone(fc.weight._grad_ivar())

    def test_case4_with_no_grad_op_maker(self):
        """The output of gaussian_random holds no gradient after backward."""
        with fluid.dygraph.guard():
            out = fluid.layers.gaussian_random(shape=[20, 30])
            loss = fluid.layers.mean(out)
            loss.backward()
            self.assertIsNone(out._grad_ivar())


# Run every test case in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()