diff --git a/python/paddle/fleet/launch.py b/python/paddle/fleet/launch.py index 8b08c916c84e098373e962b97656b786573b099a..a6f71f2ae94c0fbf2c3da96ce020900b9e747093 100644 --- a/python/paddle/fleet/launch.py +++ b/python/paddle/fleet/launch.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -paddle.distributed.launch is a module that spawns multiple distributed +fleetrun is a module that spawns multiple distributed process on each training node for gpu training and cpu training. Usage: In both of single node training or multiple node training, this module @@ -31,16 +31,26 @@ launch a process on each of the given gpu card or cpu machine. your_training_py (arg1 arg2 and all others) CPU training: 1. for single node training with multi servers and workers: - fleetrun --server_num=1 --worker_num=4 your_training_py (arg1 arg2 and all others) + fleetrun --server_num=2 --worker_num=2 your_training_py (arg1 arg2 and all others) 2. for multiple node training such as two node:192.168.0.16, 192.168.0.17 \ - with 2 servers and 4 workers. + with 2 servers and 4 workers. on 192.168.0.16: - fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \ - --workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \ + fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \ + --workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \ your_training_py (arg1 arg2 and all others) on 192.168.0.17: fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \ - --workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \ + --workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \ + your_training_py (arg1 arg2 and all others) + 3. use gloo backend for multiple node training such as two node:192.168.0.16, 192.168.0.17 \ + with 2 servers and 4 workers. 
(workers should set port) + on 192.168.0.16: + fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \ + --workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \ + your_training_py (arg1 arg2 and all others) + on 192.168.0.17: + fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \ + --workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \ your_training_py (arg1 arg2 and all others) """ @@ -215,6 +225,7 @@ def launch_collective(args): def launch_ps(args): ports = None + start_port = 6170 if args.server_num: server_num = args.server_num ports = get_ports(server_num, 0) @@ -240,11 +251,19 @@ def launch_ps(args): worker_endpoints_ips = [ x.strip().split(":")[0] for x in worker_endpoints.split(",") ] - worker_endpoints_port = [ - x.strip().split(":")[1] for x in worker_endpoints.split(",") - ] worker_num = len(worker_endpoints_ips) node_ips = list(set(server_endpoints_ips + worker_endpoints_ips)) + worker_endpoints_len = [ + len(x.strip().split(":")) for x in worker_endpoints.split(",") + ] + if 1 in worker_endpoints_len: + # If any worker endpoint omits its port, assign every worker a default port, numbered consecutively from start_port + server_num (i.e. right after the server ports).
+ worker_endpoints_port = range(start_port + server_num, + start_port + server_num + worker_num, 1) + else: + worker_endpoints_port = [ + x.strip().split(":")[1] for x in worker_endpoints.split(",") + ] # local train if len(set(node_ips)) == 1: diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh index ebe99ffb635c4a8ddef2e038ef8bfea443fe63ef..c5edc96963408bf1fad793f7271d75159934f019 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh @@ -11,7 +11,15 @@ function test_launch_ps(){ exit -1 fi - fleetrun --servers="120.0.0.1:6780,120.0.0.1:6781" --workers="120.0.0.1:6782,120.0.0.1:6783" fleet_ps_training.py 2> ut.elog + fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1:6782,127.0.0.1:6783" fleet_ps_training.py 2> ut.elog + if grep -q "server are killed" ut.elog; then + echo "test pserver launch succeed" + else + echo "test pserver launch failed" + exit -1 + fi + + fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1,127.0.0.1" fleet_ps_training.py 2> ut.elog if grep -q "server are killed" ut.elog; then echo "test pserver launch succeed" else