# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 NVIDIA Corporation.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

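"""Test that Fleet sharding and ASP 2:4 structured sparsity can be combined:
the distributed optimizer is built with both `sharding` and `asp` enabled,
the model is pruned, and the supported parameters are checked for the 2:4
sparse pattern."""
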
import os
import unittest

import numpy as np

import paddle
from paddle import fluid
from paddle.distributed import fleet
from paddle.incubate import asp as sparsity
from paddle.incubate.asp import ASPHelper

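# Restrict this test to a single GPU: keep only the first device from
# CUDA_VISIBLE_DEVICES (defaulting to device 0 when it is unset).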
cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
if cuda_visible_devices is None or cuda_visible_devices == "":
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0]

paddle.enable_static()


class TestFleetWithASPSharding(unittest.TestCase):
    def setUp(self):
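        # Minimal single-trainer collective environment for fleet.init().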
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
        os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
        os.environ["PADDLE_TRAINERS_NUM"] = "1"
        os.environ["PADDLE_TRAINER_ID"] = "0"

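        # Runtime flags: cap GPU memory usage, synchronize NCCL allreduce,
        # free tensors eagerly, tune gradient fusion, and skip NaN/Inf checks.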
        os.environ['FLAGS_fraction_of_gpu_memory_to_use'] = "0.1"
        os.environ['FLAGS_sync_nccl_allreduce'] = "1"
        os.environ['FLAGS_eager_delete_tensor_gb'] = "0"
        os.environ['FLAGS_fuse_parameter_memory_size'] = "32"
        os.environ['FLAGS_fuse_parameter_groups_size'] = "50"
        os.environ['FLAGS_check_nan_inf'] = "0"

    def net(self, main_prog, startup_prog):
        with fluid.program_guard(main_prog, startup_prog):
            input_x = paddle.static.data(
                name="x", shape=[-1, 32], dtype='float32'
            )
            input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')

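            # Simple MLP: four tanh FC layers and a softmax classifier head.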
            fc_1 = paddle.static.nn.fc(x=input_x, size=64, activation='tanh')
            fc_2 = paddle.static.nn.fc(x=fc_1, size=64, activation='tanh')
            fc_3 = paddle.static.nn.fc(x=fc_2, size=64, activation='tanh')
            fc_4 = paddle.static.nn.fc(x=fc_3, size=64, activation='tanh')
            prediction = paddle.static.nn.fc(
                x=fc_4, size=2, activation='softmax'
            )
            cost = paddle.nn.functional.cross_entropy(
                input=prediction,
                label=input_y,
                reduction='none',
                use_softmax=False,
            )
            avg_cost = paddle.mean(x=cost)

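            # Enable sharding and ASP together in the distributed strategy.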
            dist_strategy = paddle.distributed.fleet.DistributedStrategy()
            dist_strategy.sharding = True
            dist_strategy.sharding_configs = {
                "sharding_segment_strategy": "segment_broadcast_MB",
                "segment_broadcast_MB": 32,
                "segment_anchors": None,
                "sharding_degree": 8,
                "mp_degree": 1,
                "hybrid_dp": False,
                "gradient_merge_acc_step": 1,
            }
            dist_strategy.nccl_comm_num = 1
            dist_strategy.asp = True
        return avg_cost, dist_strategy, input_x, input_y

    def test_with_asp_sharding(self):
        fleet.init(is_collective=True)
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy, input_x, input_y = self.net(
            train_prog, startup_prog
        )

        with fluid.program_guard(train_prog, startup_prog):
            optimizer = fluid.optimizer.SGD(learning_rate=0.01)
            optimizer = fleet.distributed_optimizer(
                optimizer, strategy=strategy
            )
            optimizer.minimize(avg_cost)

        if fluid.is_compiled_with_cuda():
            place = fluid.CUDAPlace(
                int(os.environ.get('FLAGS_selected_gpus', 0))
            )
        else:
            place = fluid.CPUPlace()

        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
        exe.run(startup_prog)

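        # Prune supported parameters in place to the 2:4 sparse pattern.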
        sparsity.prune_model(train_prog)

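        # One optimization step on random data; the ASP-wrapped optimizer
        # re-applies the pruning masks after the parameter update.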
        data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
        exe.run(train_prog, feed=feeder.feed([data]))

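        # Every ASP-supported parameter should now satisfy 2:4 sparsity,
        # except shapes too small to hold the pattern.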
        for param in train_prog.global_block().all_parameters():
            if ASPHelper._is_supported_layer(train_prog, param.name):
                mat = np.array(
                    fluid.global_scope().find_var(param.name).get_tensor()
                )
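                # Weights whose prunable dimension is smaller than m=4 cannot
                # be pruned to 2:4, so they are expected to remain dense.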
                if (len(param.shape) == 4 and param.shape[1] < 4) or (
                    len(param.shape) == 2 and param.shape[0] < 4
                ):
                    self.assertFalse(
                        paddle.incubate.asp.check_sparsity(mat.T, n=2, m=4)
                    )
                else:
                    self.assertTrue(
                        paddle.incubate.asp.check_sparsity(mat.T, n=2, m=4)
                    )


if __name__ == "__main__":
    unittest.main()