From 9252aa41f5af28f73f890b775ce2648c02c45724 Mon Sep 17 00:00:00 2001
From: Wu Yi <typhoonzero1986@gmail.com>
Date: Mon, 21 Jan 2019 11:12:39 +0800
Subject: [PATCH] add multi process start script (#15381)

* add multi process start script test=develop

* refine tool test=develop
---
 tools/run_mp.py | 129 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 tools/run_mp.py

diff --git a/tools/run_mp.py b/tools/run_mp.py
new file mode 100644
index 000000000..2485400ab
--- /dev/null
+++ b/tools/run_mp.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import subprocess
+import os
+import sys
+import time
+import argparse
+
+# Baseline environment for every worker; start_procs() overrides the PADDLE_* keys.
+default_envs = {
+    "PADDLE_TRAINER_ENDPOINTS": ",".join("127.0.0.1:617%d" % i for i in range(8)),
+    "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+    "PATH": os.getenv("PATH", ""),  # "" fallback: Popen rejects None values in env
+    "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
+    "PADDLE_TRAINERS_NUM": "8",
+    "NCCL_DEBUG": "INFO",
+    "GLOG_v": "0",
+    "NCCL_SOCKET_IFNAME": "eth0",
+    "NCCL_IB_GID_INDEX": "3",
+    "NCCL_IB_RETRY_CNT": "0",
+}
+
+GPUS = 8  # NOTE(review): not referenced in this file; the live default is --gpus in main()
+
+
+def start_procs(gpus, cmd, log_dir):
+    """Spawn `gpus` copies of `cmd`, one per GPU, and block until all exit.
+
+    Each worker's stdout/stderr goes to `<log_dir>/workerlog.<i>`. Workers get
+    `default_envs` plus inherited FLAGS_/NCCL_/GLOG_ variables and the
+    per-rank PADDLE_* settings. Reads PADDLE_TRAINER_ID, POD_IP and
+    PADDLE_TRAINERS (comma-separated node IPs) for multi-node runs.
+    """
+    import shlex  # local import: only this function needs it
+    procs, log_fns = [], []
+    if not os.path.isdir(log_dir):
+        os.makedirs(log_dir)  # no shell, so odd characters in log_dir are safe
+    # Forward the parent's framework/NCCL/logging flags to the workers.
+    for k, v in os.environ.items():
+        if k.startswith(("FLAGS_", "NCCL_", "GLOG_")):
+            default_envs[k] = v
+
+    # ======== for dist training =======
+    node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+    current_ip = os.getenv("POD_IP", "127.0.0.1")
+    trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",")
+    # One endpoint per (node, gpu); the port is "617" followed by the gpu index.
+    all_nodes_devices_endpoints = ",".join(
+        "%s:617%d" % (n, i) for n in trainer_ips for i in range(gpus))
+    nranks = len(trainer_ips) * gpus
+    argv = shlex.split(cmd)  # honors quoting; no empty args on repeated spaces
+
+    try:
+        for i in range(gpus):
+            curr_env = dict(default_envs)
+            curr_env.update({
+                "FLAGS_selected_gpus": "%d" % i,
+                "PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
+                "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
+                "PADDLE_TRAINERS_NUM": "%d" % nranks,
+                "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
+            })
+            print("starting process ", i, cmd, curr_env)
+            log_fns.append(open("%s/workerlog.%d" % (log_dir, i), "w"))
+            procs.append(subprocess.Popen(
+                argv, stdout=log_fns[-1], stderr=log_fns[-1], env=curr_env))
+        for proc in procs:
+            proc.wait()
+    finally:
+        # Also reached on Ctrl-C or a failed launch: stop stragglers, close logs.
+        for proc in procs:
+            if proc.poll() is None:
+                proc.terminate()
+        for fn in log_fns:
+            fn.close()
+
+
+def main():
+    """Command-line entry point: parse the options and launch the workers.
+
+    Prints the help text and exits with status 0 when --cmd is missing.
+    """
+    parser = argparse.ArgumentParser(
+        # Raw formatter: keep the variable list below one name per line.
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description='''start paddle training using multi-process mode.
+NOTE: your train program ***must*** run as distributed nccl2 mode,
+see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
+And your train program must read environment variables below in order to let different
+process init properly:
+FLAGS_selected_gpus
+PADDLE_TRAINER_ID
+PADDLE_CURRENT_ENDPOINT
+PADDLE_TRAINERS_NUM
+PADDLE_TRAINER_ENDPOINTS
+POD_IP (current node ip address, not needed for local training)
+''')
+    parser.add_argument(
+        '--gpus', type=int, default=8,
+        help='number of GPUs on this node; one process is started per GPU')
+    parser.add_argument(
+        '--cmd', type=str, default="",
+        help='command to run for each process, e.g. python train.py --lr 0.1')
+    parser.add_argument(
+        '--log_dir', type=str, default="mylog",
+        help='directory to put logs per process.')
+    args = parser.parse_args()
+    if not args.cmd:
+        parser.print_help()
+        sys.exit(0)  # sys.exit: the bare exit() helper exists only when `site` is loaded
+    start_procs(args.gpus, args.cmd, args.log_dir)
+
+
+if __name__ == "__main__":  # run the launcher only when executed as a script
+    main()
-- 
GitLab