# ps_program_builder.py
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from .public import *
from paddle.distributed.fleet.base.private_helper_function import wait_server_ready
from paddle.distributed.passes import new_pass, PassContext


class PsProgramBuilder(object):
    """Base class of the parameter-server program builders in this module.

    The constructor caches the entries of ``pass_ctx._attrs`` that the
    builders read. Subclasses are expected to override
    ``_build_trainer_programs``; the base implementation raises
    ``NotImplementedError``.
    """

    def __init__(self, pass_ctx):
        """Store the pass context and the attributes read from it.

        Args:
            pass_ctx: object exposing an ``_attrs`` mapping. The keys read
                here are ``loss``, ``cloned_main``, ``cloned_startup``,
                ``use_ps_gpu``, ``is_heter_ps_mode``, ``is_worker``,
                ``is_heter_worker``, ``ps_mode``, ``launch_barrier``,
                ``launch_barrier_flag`` and ``role_maker``.
        """
        self.pass_ctx = pass_ctx
        self.attrs = self.pass_ctx._attrs
        self.loss = self.attrs['loss']
        self.cloned_main = self.attrs['cloned_main']
        self.cloned_startup = self.attrs['cloned_startup']

        self.use_ps_gpu = self.attrs['use_ps_gpu']
        self.use_heter_ps = self.attrs['is_heter_ps_mode']
        self.is_worker = self.attrs['is_worker']
        self.is_heter_worker = self.attrs['is_heter_worker']
        self.ps_mode = self.attrs['ps_mode']

        self.launch_barrier = self.attrs['launch_barrier']
        self.launch_barrier_flag = self.attrs['launch_barrier_flag']
        self.server_endpoints = self.attrs['role_maker']._get_pserver_endpoints(
        )

    def _optimize_programs(self):
        """No-op hook in the base class."""
        pass

    def _build_trainer_programs(self):
        """Build the trainer-side programs; must be provided by subclasses.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    def _build_pserver_programs(self):
        """Apply ``add_lr_decay_table_pass`` when optimize ops exist.

        Returns early, doing nothing, when ``get_optimize_ops`` finds no
        optimize ops in ``attrs['origin_main_program']``.
        """
        ops = get_optimize_ops(self.attrs['origin_main_program'])
        if not ops:
            return
        add_lr_decay_table_pass = new_pass('add_lr_decay_table_pass',
                                           self.attrs)
        add_lr_decay_table_pass.apply([], [], self.pass_ctx)
        # The previous version then scanned ``ops`` for "sgd"/"adam" and
        # returned either way; that scan had no effect and was removed.

    def _build_programs(self):
        """Build the programs for the role recorded in ``self.attrs``.

        Worker: builds the trainer programs, switches the startup program
        to ``cloned_startup`` and points ``loss.block.program`` at
        ``cloned_main``.
        Server: builds the pserver programs, then uses
        ``attrs['_main_server']`` / ``attrs['_startup_server']`` instead.
        Nothing happens when neither ``is_worker`` nor ``is_server`` is set.
        """
        if self.attrs['is_worker']:
            logger.info("start building trainer program")
            self._build_trainer_programs()
            fluid.framework.switch_startup_program(self.cloned_startup)
            self.loss.block.program = self.cloned_main

        elif self.attrs['is_server']:
            logger.info("start building pserver program")
            self._build_pserver_programs()
            self.loss.block.program = self.attrs['_main_server']
            fluid.framework.switch_startup_program(self.attrs[
                '_startup_server'])


class GeoPsProgramBuilder(PsProgramBuilder):  # CPU mode only
    """Program builder used when ``ps_mode`` is ``DistributedMode.GEO``."""

    def __init__(self, pass_ctx):
        """Initialise the builder and validate the configured ps mode.

        Args:
            pass_ctx: pass context forwarded to ``PsProgramBuilder``.

        Raises:
            ValueError: if ``ps_mode`` is not ``DistributedMode.GEO``.
        """
        logger.info("start building geo-ps program")
        super(GeoPsProgramBuilder, self).__init__(pass_ctx)
        if self.ps_mode != DistributedMode.GEO:
            # Use str.format; the earlier code passed the builtin format()
            # result as a second exception argument, so the placeholders
            # were never filled in.
            raise ValueError("ps mode: {} not matched {}".format(
                self.ps_mode, "GeoPsProgramBuilder"))

    def _build_trainer_programs(self):
        """Apply ``append_send_ops_pass`` to the cloned main program.

        Afterwards ``attrs['origin_main_program']`` is set to the cloned
        main program and, when both launch-barrier flags are truthy,
        ``wait_server_ready`` is called with the pserver endpoints.
        """
        append_send_ops_pass = new_pass("append_send_ops_pass", self.attrs)
        append_send_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)

        self.attrs['origin_main_program'] = self.cloned_main

        if self.launch_barrier and self.launch_barrier_flag:
            wait_server_ready(self.server_endpoints)

        return


class CpuSyncPsProgramBuilder(PsProgramBuilder):
    """Program builder accepting ``DistributedMode.SYNC`` and ``ASYNC``."""

    def __init__(self, pass_ctx):
        """Initialise the builder and validate the configured ps mode.

        Args:
            pass_ctx: pass context forwarded to ``PsProgramBuilder``.

        Raises:
            ValueError: if ``ps_mode`` is neither ``DistributedMode.SYNC``
                nor ``DistributedMode.ASYNC``.
        """
        super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx)
        if self.ps_mode == DistributedMode.SYNC:
            logger.info("start building cpu-sync-ps program")
        if self.ps_mode != DistributedMode.SYNC and self.ps_mode != DistributedMode.ASYNC:
            # Use str.format; the earlier code passed the builtin format()
            # result as a second exception argument, so the placeholders
            # were never filled in.
            raise ValueError("ps mode: {} not matched {}".format(
                self.ps_mode, "PsProgramBuilder"))

    def _build_trainer_programs(self):
        """Run the trainer-side pass sequence on the cloned programs.

        Passes are applied in this order: ``add_lr_decay_table_pass``,
        ``distributed_ops_pass``, ``delete_optimizer_pass``,
        ``append_send_ops_pass``, ``delete_extra_optimizer_pass`` and
        ``fake_init_ops_pass``. Afterwards the cloned programs are stored as
        ``attrs['origin_main_program']`` / ``attrs['origin_startup_program']``
        and, when both launch-barrier flags are truthy,
        ``wait_server_ready`` is called with the pserver endpoints.
        """
        # NOTE(review): the print calls below dump the program to stdout;
        # kept to preserve existing output.
        print("build trainer program entry")
        print("before ps program builder program:", self.cloned_main)
        add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass",
                                           self.attrs)
        add_lr_decay_table_pass.apply([], [], self.pass_ctx)

        print("before distributed op pass")
        distributed_ops_pass = new_pass("distributed_ops_pass", self.attrs)
        distributed_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)

        delete_optimizer_pass = new_pass("delete_optimizer_pass", self.attrs)
        delete_optimizer_pass.apply([self.cloned_main], [None], self.pass_ctx)

        append_send_ops_pass = new_pass("append_send_ops_pass", self.attrs)
        append_send_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)

        # This pass receives the *origin* main program still held in attrs,
        # which is only replaced by the cloned program further below.
        delete_extra_optimizer_pass = new_pass("delete_extra_optimizer_pass",
                                               self.attrs)
        delete_extra_optimizer_pass.apply([self.attrs['origin_main_program']],
                                          [self.cloned_startup], self.pass_ctx)

        fake_init_ops_pass = new_pass("fake_init_ops_pass", self.attrs)
        fake_init_ops_pass.apply([None], [self.cloned_startup], self.pass_ctx)

        self.attrs['origin_main_program'] = self.cloned_main
        self.attrs['origin_startup_program'] = self.cloned_startup
        print("after ps program builder program:", self.cloned_main)

        if self.launch_barrier and self.launch_barrier_flag:
            wait_server_ready(self.server_endpoints)

        return


class CpuAsyncPsProgramBuilder(CpuSyncPsProgramBuilder):
    """Variant of ``CpuSyncPsProgramBuilder`` that only adds a log line.

    All building logic and the ps-mode validation are inherited unchanged.
    """

    def __init__(self, pass_ctx):
        """Log the start of the build, then defer to the parent constructor.

        Args:
            pass_ctx: pass context forwarded to ``CpuSyncPsProgramBuilder``.
        """
        logger.info("start building cpu-async-ps program")
        super(CpuAsyncPsProgramBuilder, self).__init__(pass_ctx)


Z
ziyoujiyi 已提交
146
class GpuPsProgramBuilder(PsProgramBuilder):
    """Program builder that runs the GPU-PS pass sequence for trainers."""

    def __init__(self, pass_ctx):
        """Log the start of the build, then defer to the base constructor.

        Args:
            pass_ctx: pass context forwarded to ``PsProgramBuilder``.
        """
        logger.info("start building gpu-ps program")
        super(GpuPsProgramBuilder, self).__init__(pass_ctx)

    def _build_trainer_programs(self):
        """Run the trainer-side pass sequence on the cloned programs.

        Each entry of ``schedule`` is (pass name, main programs, startup
        programs); the passes are created and applied one after another in
        the listed order. Afterwards the cloned programs are stored as
        ``attrs['origin_main_program']`` / ``attrs['origin_startup_program']``
        and, when both launch-barrier flags are truthy,
        ``wait_server_ready`` is called with the pserver endpoints.
        """
        main_prog = self.cloned_main
        startup_prog = self.cloned_startup

        schedule = (
            ("add_lr_decay_table_pass", [], []),
            ("distributed_ops_pass", [main_prog], [None]),
            ("fake_init_ops_pass", [None], [startup_prog]),
            ("ps_gpu_pass", [main_prog], [None]),
            ("ps_transpile_pass", [main_prog], [startup_prog]),
        )
        for pass_name, main_programs, startup_programs in schedule:
            current_pass = new_pass(pass_name, self.attrs)
            current_pass.apply(main_programs, startup_programs, self.pass_ctx)

        self.attrs['origin_main_program'] = main_prog
        self.attrs['origin_startup_program'] = startup_prog

        barrier_requested = self.launch_barrier and self.launch_barrier_flag
        if barrier_requested:
            wait_server_ready(self.server_endpoints)


class HeterAsyncPsProgramBuilder(PsProgramBuilder):
    """Program builder for the heterogeneous parameter-server mode."""

    def __init__(self, pass_ctx):
        """Initialise the builder and validate the configuration.

        Args:
            pass_ctx: pass context forwarded to ``PsProgramBuilder``.

        Raises:
            ValueError: if ``use_ps_gpu`` is truthy, ``ps_mode`` is
                ``DistributedMode.GEO``, or ``attrs['is_heter_ps_mode']``
                equals ``False``.
        """
        logger.info("start building heter-async-ps program")
        super(HeterAsyncPsProgramBuilder, self).__init__(pass_ctx)
        if self.use_ps_gpu or self.ps_mode == DistributedMode.GEO or self.attrs[
                'is_heter_ps_mode'] == False:
            # Use str.format; the earlier code passed the builtin format()
            # result as a second exception argument, so the placeholders
            # were never filled in.
            raise ValueError("ps mode: {} not matched {}".format(
                self.ps_mode, "HeterAsyncPsProgramBuilder"))

    def _build_trainer_programs(self):
        """Run the trainer-side pass sequence on the cloned programs.

        Passes are applied in this order: ``add_lr_decay_table_pass``,
        ``distributed_ops_pass``, ``delete_optimizer_pass``,
        ``append_send_ops_pass``, ``delete_extra_optimizer_pass``,
        ``fake_init_ops_pass``, then ``split_heter_worker_ops_pass`` (heter
        worker) or ``split_trainer_ops_pass`` (otherwise), and finally
        ``set_heter_pipeline_opt_pass``. When both launch-barrier flags are
        truthy, ``wait_server_ready`` is called with the pserver endpoints.
        """
        add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass",
                                           self.attrs)
        add_lr_decay_table_pass.apply([], [], self.pass_ctx)

        distributed_ops_pass = new_pass("distributed_ops_pass", self.attrs)
        distributed_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)

        delete_optimizer_pass = new_pass("delete_optimizer_pass", self.attrs)
        delete_optimizer_pass.apply([self.cloned_main], [None], self.pass_ctx)

        append_send_ops_pass = new_pass("append_send_ops_pass", self.attrs)
        append_send_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)

        # This pass receives the origin main program held in attrs, not the
        # cloned one.
        delete_extra_optimizer_pass = new_pass("delete_extra_optimizer_pass",
                                               self.attrs)
        delete_extra_optimizer_pass.apply([self.attrs['origin_main_program']],
                                          [self.cloned_startup], self.pass_ctx)

        fake_init_ops_pass = new_pass("fake_init_ops_pass", self.attrs)
        fake_init_ops_pass.apply([None], [self.cloned_startup], self.pass_ctx)

        if self.is_heter_worker:
            split_heter_worker_ops_pass = new_pass(
                "split_heter_worker_ops_pass", self.attrs)
            split_heter_worker_ops_pass.apply([self.cloned_main], [None],
                                              self.pass_ctx)
        else:
            split_trainer_ops_pass = new_pass("split_trainer_ops_pass",
                                              self.attrs)
            split_trainer_ops_pass.apply([self.cloned_main], [None],
                                         self.pass_ctx)

        set_heter_pipeline_opt_pass = new_pass('set_heter_pipeline_opt_pass',
                                               self.attrs)
        set_heter_pipeline_opt_pass.apply([self.cloned_main],
                                          [self.cloned_startup], self.pass_ctx)

        if self.launch_barrier and self.launch_barrier_flag:
            wait_server_ready(self.server_endpoints)

        return

    def _build_programs(self):
        """Build the programs for the role recorded in ``self.attrs``.

        Worker or heter worker: builds the trainer programs and then applies
        ``set_heter_pipeline_opt_pass``.
        Server: builds the pserver programs, points ``loss.block.program``
        at ``attrs['_main_server']`` and switches the startup program to
        ``attrs['_startup_server']``.
        """
        if self.attrs['is_worker'] or self.attrs['is_heter_worker']:
            self._build_trainer_programs()
            # NOTE(review): _build_trainer_programs already applies
            # set_heter_pipeline_opt_pass, so it runs twice on this path --
            # confirm whether the second application is intended.
            ps_set_heter_pipeline_opt_pass = new_pass(
                "set_heter_pipeline_opt_pass", self.attrs)
            ps_set_heter_pipeline_opt_pass.apply(
                [self.cloned_main], [self.cloned_startup], self.pass_ctx)

        elif self.attrs['is_server']:
            self._build_pserver_programs()
            self.loss.block.program = self.attrs['_main_server']
            fluid.framework.switch_startup_program(self.attrs[
                '_startup_server'])


class FlPsProgramBuilder(PsProgramBuilder):
    """Placeholder builder whose build steps are all no-ops."""

    def __init__(self, pass_ctx):
        """Defer entirely to the ``PsProgramBuilder`` constructor.

        Args:
            pass_ctx: pass context forwarded to ``PsProgramBuilder``.
        """
        super(FlPsProgramBuilder, self).__init__(pass_ctx)

    def _build_trainer_programs(self):
        """Do nothing; overrides the base method that raises."""

    def _build_pserver_programs(self):
        """Do nothing; no pserver-side pass is applied."""

    def _build_programs(self):
        """Do nothing; neither trainer nor pserver programs are built."""