# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import six

__all__ = []


# Print the configuration once args are fully populated in controller init.
def log(ctx):
    ctx.logger.info("-----------  Configuration  ----------------------")
    for arg, value in sorted(six.iteritems(vars(ctx.args))):
        ctx.logger.info("%s: %s" % (arg, value))
    ctx.logger.info("--------------------------------------------------")


# On IPU builds, reroute the launch to the dedicated ipu_launch utility script.
def rewrite_ipu_script(ctx):
    import paddle.fluid as fluid
    if fluid.core.is_compiled_with_ipu():
        import os
        if ctx.args.training_script != "ipu":
            raise RuntimeError(
                "Only the script 'ipu' is supported for IPU distributed computing."
            )
        ctx.args.training_script = os.path.abspath(
            os.path.join(
                os.path.dirname(os.path.dirname(__file__)),
                "utils/ipu_launch.py"))


# Validate the devices requested via args against the devices on this node.
def process_args(ctx):
    # reset devices according to args
    # argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus
    argdev = ctx.args.devices
    if argdev:
        for d in argdev.split(','):
            assert d in ctx.node.device.labels, 'Device not found {}'.format(d)
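
# Example (hypothetical labels): with ``--devices=0,1`` and
# ctx.node.device.labels == ['0', '1', '2', '3'], validation passes; any
# device id missing from the labels trips the assertion above.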


# Derive --master and --nnodes from the PADDLE_TRAINER_ENDPOINTS environment
# variable, for compatibility with the older collective launch interface.
def collective_compatible(ctx):
    if 'PADDLE_TRAINER_ENDPOINTS' in ctx.envs:
        eps = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')
        hosts = set([h.split(':')[0] for h in eps])
        ctx.args.master = eps[0] if ':' in eps[0] else '{}:6768'.format(eps[0])
        ctx.args.nnodes = len(hosts)
        ctx.logger.info('args reset by env PADDLE_TRAINER_ENDPOINTS\n{}'.format(
            eps))
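    # The DISTRIBUTED_TRAINER_ENDPOINTS branch below is disabled: as a bare
    # string literal it is evaluated and discarded at runtime.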
    '''
    if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs:
        eps = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')
        hosts = set([h.split(':')[0] for h in eps])
        ctx.args.master = eps[0]
        ctx.args.nnodes = len(hosts)
        ctx.logger.info(
            'args reset by env DISTRIBUTED_TRAINER_ENDPOINTS\n{}'.format(eps))
    '''
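
# Worked example (hypothetical addresses): with
#   PADDLE_TRAINER_ENDPOINTS=10.0.0.1:6170,10.0.0.1:6171,10.0.0.2:6170
# the plugin sets ctx.args.master = '10.0.0.1:6170' and ctx.args.nnodes = 2
# (two distinct hosts); a port-less first endpoint falls back to port 6768.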


# Override the node IP with the user-supplied --host argument, when given.
def rewrite_host_ip(ctx):
    if ctx.args.host is not None and "." in ctx.args.host:
        ctx.logger.warning('Host ip reset to {}'.format(ctx.args.host))
        ctx.node.ip = ctx.args.host
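
# e.g. ``--host 10.0.0.1`` (hypothetical address) makes this node advertise
# that IP; the "." check skips bare, dot-free hostnames.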


# Plugins applied by the launch controller, in list order.
enabled_plugins = [
    collective_compatible, rewrite_host_ip, process_args, rewrite_ipu_script
]
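
# A minimal sketch (an assumption, not the actual controller code) of how the
# launch controller might apply these plugins once its context is built:
#
#   for plugin in enabled_plugins:
#       plugin(ctx)  # each plugin mutates ctx.args / ctx.node in place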