elastic.py 2.3 KB
Newer Older
K
kuizhiqing 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import six
import os


class Command(object):
    """Small client for the elastic-job keys that a job keeps in etcd.

    Every key of a job lives below ``/paddle/<name>``. This class builds the
    paths under that prefix and reads/writes the ``np`` key.
    """

    def __init__(self, server, name):
        """Connect to etcd and derive the key paths used by the job.

        Args:
            server: etcd endpoint formatted as ``host:port``.
            name: job id; becomes the last component of the key prefix.

        Raises:
            ValueError: if ``server`` has no ``:port`` part or the port is
                not an integer.
        """
        # The import happens here, at construction time, rather than at
        # module import time.
        import etcd3

        # Split on the last colon only so a host that itself contains colons
        # (e.g. a bracketed IPv6 literal) does not break the unpacking.
        srv, port = server.rsplit(':', 1)
        self.etcd = etcd3.client(host=srv, port=int(port))

        self.prefix = "/paddle/" + name
        self.node_prefix = self.prefix + '/nodes'
        self.np_path = self.prefix + '/np'

    def set_np(self, np):
        """Store ``np`` under the job's ``np`` key, overwriting any old value.

        Args:
            np: value to store; it is formatted with ``'{}'.format`` and
                written as bytes.
        """
        self.etcd.put(self.np_path, '{}'.format(np).encode('utf-8'))

    def scale_np(self, np):
        """Update ``np`` only when the job already has an ``np`` key.

        Args:
            np: the new value to store.

        Returns:
            True if the key existed and was overwritten, False if the key
            was absent and nothing was written.
        """
        # The first element of the get() result is the stored value; it is
        # None when the key does not exist.
        current = self.etcd.get(self.np_path)[0]
        if current is None:
            return False
        self.set_np(np)
        return True

    def close(self):
        """Close the underlying etcd client."""
        self.etcd.close()


if __name__ == '__main__':
    # Command line entry point: resolve the etcd endpoint, job id and node
    # count from the arguments (falling back to environment variables), then
    # run the requested action against etcd.

    parser = argparse.ArgumentParser(description='Elastic Command')
    parser.add_argument(
        "--elastic_server", type=str, help="etcd server host:port")
    parser.add_argument("--job_id", type=str, help="job unique id")
    parser.add_argument("--np", type=int, help="job pod/node number")
    parser.add_argument("action", type=str, help="action to take")

    args = parser.parse_args()

    # Reject unknown actions up front instead of connecting, doing nothing
    # and still reporting the action as done.
    if args.action != "scale":
        parser.error("unsupported action: {}".format(args.action))

    server = args.elastic_server or os.getenv('PADDLE_ELASTIC_SERVER')
    # compatible with Kubernetes service discovery
    if not server:
        etcd_host = os.getenv('PADDLE_ELASTIC_ETCD_SERVICE_HOST')
        etcd_port = os.getenv('PADDLE_ELASTIC_ETCD_SERVICE_PORT')
        if etcd_host and etcd_port:
            server = '{}:{}'.format(etcd_host, etcd_port)
    if not server:
        parser.error("etcd server is required: pass --elastic_server or set "
                     "PADDLE_ELASTIC_SERVER")

    name = args.job_id or os.getenv('PADDLE_ELASTIC_JOB_ID')
    if not name:
        parser.error(
            "job id is required: pass --job_id or set PADDLE_ELASTIC_JOB_ID")

    # An explicit --np always wins, even when it is 0; the environment
    # variable is only consulted when the option was not given.
    if args.np is not None:
        np = args.np
    else:
        np = int(os.getenv('PADDLE_ELASTIC_NP', 0))

    cmd = Command(server, name)
    try:
        # scale_np returns False when the job has no np key in etcd; report
        # that as a failure instead of printing "done".
        if not cmd.scale_np(np):
            parser.exit(
                1, "action {} failed: no np found for job {}\n".format(
                    args.action, name))
    finally:
        # Always release the etcd connection, including on errors and exits.
        cmd.close()

    print("action {} done".format(args.action))