diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 129453149c6d377e871b2c3b807d92ea10b7a591..1f4648f7963811e50d890a60aa10a399e7681f84 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -39,6 +39,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_run_random_port) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_async) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_cloud) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend) +list(APPEND MIXED_DIST_TEST_OPS test_ascend_group) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc) list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input) list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer) @@ -524,6 +525,7 @@ if(WITH_DISTRIBUTE) bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_ascend_group START_BASH test_ascend_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) # port range (20000, 23000) is reserved for dist-ops set(dist_ut_port 20001) diff --git a/python/paddle/fluid/tests/unittests/ascend_group.py b/python/paddle/fluid/tests/unittests/ascend_group.py new file mode 100644 index 0000000000000000000000000000000000000000..2d5b709a48eefffe5b2b0a5f328fc3bdd40b919a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ascend_group.py @@ -0,0 +1,118 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Distributed test script: build per-trainer static programs that set up
three communication rings over four trainers and run a c_allreduce_sum on
each ring.  Launched by test_ascend_group.sh via fleet.launch."""

import os
import sys
import time

import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.distributed import fleet
from paddle.fluid import unique_name
from paddle.fluid.layer_helper import LayerHelper

paddle.enable_static()

OpRole = core.op_proto_and_checker_maker.OpRole
OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName()
OP_ROLE_VAR_KEY = core.op_proto_and_checker_maker.kOpRoleVarAttrName()

# Module-level side effect: this script is meant to be run under
# paddle.distributed.fleet.launch, which provides the cloud env vars.
role = fleet.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)


def init_communicator(startup_program, main_program, current_endpoint,
                      endpoints, ring_id):
    """Append communicator-setup ops for one ring.

    Adds c_gen_nccl_id / c_comm_init ops to ``startup_program`` so that
    ``current_endpoint`` joins the communication group ``endpoints`` under
    ``ring_id``, then appends a c_allreduce_sum over a constant tensor to
    ``main_program`` to exercise the ring.

    Args:
        startup_program: fluid.Program receiving the comm-init ops.
        main_program: fluid.Program receiving the allreduce op.
        current_endpoint: this trainer's "ip:port" string; must be a member
            of ``endpoints``.
        endpoints: list of "ip:port" strings forming the group.
        ring_id: integer id distinguishing this communication ring.
    """
    nranks = len(endpoints)
    # Copy before removing so the caller's list is not mutated.
    other_endpoints = endpoints[:]
    other_endpoints.remove(current_endpoint)
    group_rank = endpoints.index(current_endpoint)
    assert group_rank >= 0

    block = startup_program.global_block()
    nccl_id_var = block.create_var(
        name=unique_name.generate('nccl_id'),
        persistable=True,
        type=core.VarDesc.VarType.RAW)
    block.append_op(
        type='c_gen_nccl_id',
        inputs={},
        outputs={'Out': nccl_id_var},
        attrs={
            'rank': group_rank,
            'endpoint': current_endpoint,
            'other_endpoints': other_endpoints,
            OP_ROLE_KEY: OpRole.Forward,
        })
    block.append_op(
        type='c_comm_init',
        inputs={'X': nccl_id_var},
        outputs={},
        attrs={
            'nranks': nranks,
            'rank': group_rank,
            'ring_id': ring_id,
            OP_ROLE_KEY: OpRole.Forward,
        })
    # NOTE(review): a persistable "data" var is declared in the startup
    # program while the allreduce below operates on a fresh fill_constant
    # in the main program — presumably intentional scaffolding; confirm.
    block.create_var(name="data", persistable=True, dtype='float32')

    with fluid.program_guard(main_program):
        op_type = "c_allreduce_sum"
        data = fluid.layers.fill_constant(
            shape=[1], dtype='float32', value=2.5)
        helper = LayerHelper(op_type, **locals())
        # In-place allreduce: X and Out are the same variable.
        helper.append_op(
            type=op_type,
            inputs={'X': [data]},
            outputs={'Out': [data]},
            attrs={'ring_id': ring_id,
                   'use_calc_stream': True})


def train(world_endpoints, world_device_ids, local_device_ids, local_rank):
    """Build startup/main programs for every trainer and print this rank's.

    Creates three fixed rings over the first four endpoints:
    ring 0 = {0, 1}, ring 1 = {2, 3}, ring 2 = {0, 2}, initialising the
    communicator ops in each member trainer's program pair.

    Args:
        world_endpoints: list of all trainer "ip:port" strings (needs >= 4).
        world_device_ids: device ids across the whole job (printed only by
            the caller; unused here).
        local_device_ids: device ids on this node (unused here).
        local_rank: index of this trainer; selects which programs to print.
    """
    # The ring layout below hard-codes four trainers; fail loudly rather
    # than with a bare IndexError if the job was launched with fewer.
    if len(world_endpoints) < 4:
        raise ValueError(
            "train() requires at least 4 trainer endpoints, got %d"
            % len(world_endpoints))

    trainer_endpoints = world_endpoints
    groups = [
        [trainer_endpoints[0], trainer_endpoints[1]],
        [trainer_endpoints[2], trainer_endpoints[3]],
        [trainer_endpoints[0], trainer_endpoints[2]],
    ]

    startup_programs = [fluid.Program() for _ in trainer_endpoints]
    main_programs = [fluid.Program() for _ in trainer_endpoints]

    # ring_id is the group's index; each member endpoint gets the comm-init
    # ops appended to its own program pair.
    for ring_id, group in enumerate(groups):
        for endpoint in group:
            idx = trainer_endpoints.index(endpoint)
            init_communicator(startup_programs[idx], main_programs[idx],
                              endpoint, group, ring_id)

    print(len(startup_programs))
    print(startup_programs[local_rank])
    print(main_programs[local_rank])


worker_endpoints = fleet.worker_endpoints()
world_device_ids = fleet.world_device_ids()
local_device_ids = fleet.local_device_ids()
local_rank = int(fleet.local_rank())

print("worker_endpoints:", worker_endpoints)
print("world_device_ids:", world_device_ids)
print("local_device_ids:", local_device_ids)
print("local_rank:", local_rank)

train(worker_endpoints, world_device_ids, local_device_ids, local_rank)
#!/bin/bash

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Driver for ascend_group.py: fakes a 4-trainer single-node Paddle cloud
# environment, then launches the script on 4 Ascend NPUs via fleet.launch.

set -e

# Single-node fixture: all 4 trainers live on 127.0.0.1.
cluster_node_ips="127.0.0.1"
export PADDLE_TRAINERS_NUM=4
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1
export PADDLE_TRAINER_ID=0

# Base port; fleet.launch derives one port per trainer from it.
export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=4

distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1,2,3 --log_dir=testlog"
# NOTE(review): the trailing "fleetascendgroup" token is passed through as
# argv to ascend_group.py (which ignores argv) — looks like a job tag;
# confirm it is intentional.
python -m paddle.distributed.fleet.launch ${distributed_args} \
    ascend_group.py fleetascendgroup