import os


def dist_env():
    """
    Return a dict of all variables that distributed training may use.
    NOTE: you may rewrite this function to suit your cluster environments.
    """
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    num_trainers = 1
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    assert training_role == "PSERVER" or training_role == "TRAINER"

    # - PADDLE_TRAINER_ENDPOINTS means nccl2 mode.
    # - PADDLE_PSERVER_ENDPOINTS means pserver mode.
    # - PADDLE_CURRENT_ENDPOINT means current process endpoint.
    worker_endpoints = []
    port = os.getenv("PADDLE_PORT", "8701")

    # Trainer endpoints: use the explicit list if given, otherwise build it
    # from the trainer IP list (paddlecloud style) and the shared port.
    if os.getenv("PADDLE_TRAINER_ENDPOINTS"):
        trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
    else:  # for paddlecloud
        worker_ips = os.getenv("PADDLE_TRAINERS", "")
        for ip in worker_ips.split(","):
            worker_endpoints.append(':'.join([ip, port]))
        trainer_endpoints = ",".join(worker_endpoints)

    # Pserver endpoints are always built from the pserver IP list.
    pserver_ips = os.getenv("PADDLE_PSERVERS", "")
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)

    if os.getenv("PADDLE_CURRENT_ENDPOINT"):
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
    else:  # for paddlecloud
        current_endpoint = os.getenv("POD_IP", "") + ":" + port

    # Number of trainers: count the trainer endpoints in nccl2 mode,
    # otherwise fall back to PADDLE_TRAINERS_NUM in pserver mode.
    if trainer_endpoints:
        trainer_endpoints = trainer_endpoints.split(",")
        num_trainers = len(trainer_endpoints)
    elif pserver_endpoints:
        num_trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))

    return {
        "trainer_id": trainer_id,
        "num_trainers": num_trainers,
        "current_endpoint": current_endpoint,
        "training_role": training_role,
        "pserver_endpoints": pserver_endpoints,
        "trainer_endpoints": trainer_endpoints
    }
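

# Minimal usage sketch (an assumption, not part of the original module):
# running this file directly prints the resolved distributed settings, which
# is handy for checking that the PADDLE_* environment variables a cluster
# scheduler exports are being picked up as expected.
if __name__ == "__main__":
    env = dist_env()
    print("training_role=%s trainer_id=%d num_trainers=%d" %
          (env["training_role"], env["trainer_id"], env["num_trainers"]))
    print("current_endpoint=%s" % env["current_endpoint"])
    print("trainer_endpoints=%s" % env["trainer_endpoints"])
    print("pserver_endpoints=%s" % env["pserver_endpoints"])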