From 2533cac66efadce1c28789d26c768c15c5b0cb1e Mon Sep 17 00:00:00 2001 From: zn <96479180+kangna-qi@users.noreply.github.com> Date: Fri, 25 Feb 2022 13:28:43 +0800 Subject: [PATCH] [MLU]support launch process on mlu (#39839) --- python/paddle/distributed/fleet/launch.py | 27 ++++++- .../paddle/distributed/fleet/launch_utils.py | 62 ++++++++++++++- .../fluid/tests/unittests/mlu/CMakeLists.txt | 19 ++++- .../tests/unittests/mlu/multi_process_mlu.py | 77 +++++++++++++++++++ .../tests/unittests/mlu/nproc_process_mlu.py | 38 +++++++++ .../unittests/mlu/test_launch_async_mlu.sh | 59 ++++++++++++++ .../unittests/mlu/test_launch_cloud_mlu.sh | 58 ++++++++++++++ .../unittests/mlu/test_launch_nproc_mlu.sh | 75 ++++++++++++++++++ 8 files changed, 407 insertions(+), 8 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/nproc_process_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_launch_async_mlu.sh create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_launch_cloud_mlu.sh create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_launch_nproc_mlu.sh diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 19306d3da99..0d985a52325 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -156,6 +156,16 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra ) base_group.add_argument("--selected_npus", dest="npus") + if fluid.core.is_compiled_with_mlu(): + base_group.add_argument( + "--mlus", + type=str, + default=None, + help="It's for mlu training. For example: " + "--mlus=\"0,1,2,3\" will launch four training processes each bound to one mlu." + ) + base_group.add_argument("--selected_mlus", dest="mlus") + base_group.add_argument( "training_script", type=str, @@ -429,6 +439,8 @@ def infer_backend(args): args.backend = 'unknown' elif fluid.core.is_compiled_with_xpu(): args.backend = 'bkcl' + elif fluid.core.is_compiled_with_mlu(): + args.backend = 'cncl' else: args.backend = 'gloo' @@ -472,6 +484,8 @@ def which_distributed_mode(args): accelerators = fluid.core.get_npu_device_count() elif fluid.core.is_compiled_with_xpu(): accelerators = fluid.core.get_xpu_device_count() + elif fluid.core.is_compiled_with_mlu(): + accelerators = fluid.core.get_mlu_device_count() else: accelerators = 0 @@ -490,17 +504,18 @@ def which_distributed_mode(args): return DistributeMode.COLLECTIVE else: if not fluid.core.is_compiled_with_cuda( - ) and not fluid.core.is_compiled_with_xpu(): + ) and not fluid.core.is_compiled_with_xpu( + ) and not fluid.core.is_compiled_with_mlu(): if args.servers: logger.warning( - "Not found distinct arguments and not compiled with cuda or xpu or npu. " + "Not found distinct arguments and not compiled with cuda or xpu or npu or mlu. " "But found args.servers not empty, default use ps mode") return DistributeMode.PS else: return DistributeMode.COLLECTIVE else: logger.warning( - "Not found distinct arguments and compiled with cuda or xpu or npu. " + "Not found distinct arguments and compiled with cuda or xpu or npu or mlu. " "Default use collective mode") return DistributeMode.COLLECTIVE @@ -536,6 +551,10 @@ def launch(): - ``--selected_xpus``: xpus aliases, recommend to use ``--xpus``. + - ``--mlus``: It's for mlu training. e.g., ``--mlus=0,1,2,3`` will launch four training processes each bound to one mlu. 
+ + - ``--selected_mlus``: mlus aliases, recommend to use ``--mlus``. + - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``traing.py`` - ``training_script_args``: The args of training_script. e.g., ``--lr=0.1`` @@ -688,7 +707,7 @@ def launch(): check_backend(args.backend) distribute_mode = DistributeMode.COLLECTIVE - #assert args.backend in ['gloo', 'nccl', 'bkcl', 'heter', 'unknown'] + #assert args.backend in ['gloo', 'nccl', 'bkcl', 'cncl', 'heter', 'unknown'] if args.backend == 'gloo': logger.warning("launch start with CPUONLY mode") diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index c20c209d601..2dec58c7538 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -57,6 +57,7 @@ class DeviceMode(): XPU = 2 ASCEND_NPU = 3 UNKNOWN = 3 + MLU = 4 class Cluster(object): @@ -287,7 +288,7 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode, ), "current trainer_endpoints size should be greater equal than acclerators size." for i in range(len(devices_per_proc)): trainer = Trainer() - if device_mode == DeviceMode.GPU or device_mode == DeviceMode.ASCEND_NPU: + if device_mode == DeviceMode.GPU or device_mode == DeviceMode.ASCEND_NPU or device_mode == DeviceMode.MLU: if isinstance(devices_per_proc[i], (list, tuple)): trainer.accelerators.extend(devices_per_proc[i]) pod.accelerators.extend(devices_per_proc[i]) @@ -530,6 +531,9 @@ def start_local_trainers(cluster, accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU: proc_env["FLAGS_selected_npus"] = "%s" % ",".join( [str(g) for g in t.accelerators]) + elif len(t.accelerators) > 0 and pod.device_mode == DeviceMode.MLU: + proc_env["FLAGS_selected_mlus"] = "%s" % ",".join( + [str(g) for g in t.accelerators]) if len(t.accelerators) > 0: proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join( @@ -735,6 +739,35 @@ def get_npus(npus): return res_npus +def get_mlus(mlus): + if mlus is None: + mlus_num = fluid.core.get_mlu_device_count() + res_mlus = [str(x) for x in range(0, mlus_num)] + else: + mlu_visible_devices = os.getenv("MLU_VISIBLE_DEVICES") + if mlu_visible_devices is None or mlu_visible_devices == "": + res_mlus = [x.strip() for x in mlus.split(',')] + else: + # change mlus into relative values + # e.g. MLU_VISIBLE_DEVICES=4,5,6,7; args.mlus=4,5,6,7; + # therefore mlus=0,1,2,3 + mlu_visible_devices_list = mlu_visible_devices.split(',') + for x in mlus.split(','): + assert x in mlu_visible_devices_list, "Can't find "\ + "your mlus %s in MLU_VISIBLE_DEVICES[%s]."\ + % (x, mlu_visible_devices) + res_mlus = [ + mlu_visible_devices_list.index(x.strip()) + for x in mlus.split(',') + ] + logger.info("Change selected_mlus into reletive values. 
--ips:{} " + "will change into relative_ips:{} according to your " + "MLU_VISIBLE_DEVICES:{}".format( + mlus, res_mlus, mlu_visible_devices_list)) + + return res_mlus + + def get_device_mode(backend): if backend == 'heter': if fluid.core.is_compiled_with_cuda() and \ @@ -763,6 +796,10 @@ def get_device_mode(backend): print("launch train in XPU mode") return DeviceMode.XPU + if backend == 'cncl' and fluid.core.get_mlu_device_count() > 0: + print("launch train in MLU mode") + return DeviceMode.MLU + if backend == 'gloo': print("launch train in CPU mode") return DeviceMode.CPU @@ -812,6 +849,18 @@ def get_device_proc_info(args): ] else: devices_per_proc = xpus + elif device_mode == DeviceMode.MLU: + mlus = get_mlus(args.mlus) + if args.nproc_per_node is not None: + assert (len(mlus) % int(args.nproc_per_node)) ==0, \ + "mlus' number:{} mod args.nproc_per_node:{} must == 0".format(len(mlus), args.nproc_per_node) + + n = int(len(mlus) / int(args.nproc_per_node)) + devices_per_proc = [ + mlus[i:i + n] for i in six.moves.range(0, len(mlus), n) + ] + else: + devices_per_proc = mlus elif device_mode == DeviceMode.CPU: if hasattr(args, "paddle_cpuonly") and args.nproc_per_node is None: #NOTE (xiongkun03) set it to cpu core number @@ -1719,7 +1768,7 @@ class ParameterServerLauncher(object): def check_backend(backend): - if backend not in ['nccl', 'gloo', 'bkcl', 'auto', 'hccl', 'heter']: + if backend not in ['nccl', 'gloo', 'bkcl', 'cncl', 'auto', 'hccl', 'heter']: raise ValueError("paddle.distributed initialize error, " "backend argument can only be one of " "'nccl', 'gloo', 'bkcl', 'auto', 'hccl', 'heter' " @@ -1743,6 +1792,12 @@ def check_backend(backend): "your paddle is not compiled with npu but you assign 'hccl' as backend." ) + if backend == 'cncl' and not fluid.core.is_compiled_with_mlu(): + raise ValueError( + "paddle.distributed initialize error, " + "your paddle is not compiled with mlu but you assign 'cncl' as backend." 
+ ) + def block_windows_and_macos(backend): if backend != 'gloo': return @@ -1766,4 +1821,7 @@ def get_backend_by_compile_flag(): if fluid.core.is_compiled_with_npu(): return 'hccl' + if fluid.core.is_compiled_with_mlu(): + return 'cncl' + return 'gloo' diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt index 41f3a31017e..c17790bd320 100644 --- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -1,10 +1,25 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +file(GLOB TEST_DIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_collective_*.py") +string(REPLACE ".py" "" TEST_DIST_OPS "${TEST_DIST_OPS}") if (WITH_MLU) + foreach(TEST_OP ${TEST_DIST_OPS}) + LIST(REMOVE_ITEM TEST_OPS ${TEST_OP}) + endforeach(TEST_OP) + foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) - set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120) + + if(WITH_CNCL) + foreach(TEST_OP ${TEST_DIST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + endforeach(TEST_OP) + bash_test_modules(test_launch_async_mlu START_BASH test_launch_async_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_launch_nproc_mlu START_BASH test_launch_nproc_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120) + endif(WITH_CNCL) endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py b/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py new file mode 100644 index 00000000000..9ea550a8452 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import time +import paddle.fluid as fluid + + +def train(prefix): + selected_mlus = os.getenv("FLAGS_selected_mlus") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + + name = "selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + + print(name) + with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(name) + + +def train_abort(prefix): + selected_mlus = os.getenv("FLAGS_selected_mlus") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + + if trainer_id == 0: + try: + # train abort + exit(1) + except SystemExit: + name = "abort>>> selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + print(name) + with open( + "multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(name) + raise + else: + # sleep 30s to make sure paddle.distributed.launch will terminate this process + time.sleep(30) + name = "selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + + print(name) + with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(name) + + +if __name__ == '__main__': + if len(sys.argv) == 3 and sys.argv[2] == "abort": + prefix = sys.argv[1] + train_abort(prefix) + else: + prefix = sys.argv[1] + train(prefix) diff --git a/python/paddle/fluid/tests/unittests/mlu/nproc_process_mlu.py b/python/paddle/fluid/tests/unittests/mlu/nproc_process_mlu.py new file mode 100644 index 00000000000..9b2713532e4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/nproc_process_mlu.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import time + + +def train(prefix): + selected_mlus = os.getenv("FLAGS_selected_mlus") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + + name = "selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + + print(name) + with open("{}.check_{}.log".format(prefix, trainer_id), "w") as f: + f.write(name) + + +if __name__ == '__main__': + prefix = sys.argv[1] + train(prefix) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_launch_async_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_launch_async_mlu.sh new file mode 100644 index 00000000000..adf30191861 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_launch_async_mlu.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud +unset PADDLE_PORT +export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 +export cluster_node_ips="127.0.0.1,127.0.0.2" +export PADDLE_TRAINERS_NUM=2 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 +export PADDLE_TRAINER_ID=0 + +export TRAINER_PORTS_NUM=2 + +file_0="multi_process_fullpath_launch.check_0.log" +file_1="multi_process_fullpath_launch.check_1.log" + +distributed_args="--ips=${cluster_node_ips} --mlus=0,1 --log_dir=testlog" + +echo "paddle.distributed.fleet.launch async poll process test" +if ! MLU_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process_mlu.py fullpath_launch abort; then + echo "train abort as planned" +fi + +abort_str1="abort>>> selected_mlus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0" + +if grep -q "$abort_str1" "$file_0"; then + echo "trainer 0 abort as planned" +else + echo "trainer 0 not abort as planned" + exit -1 +fi + +if [ ! -f $file_1 ]; then + echo "trainer 1 terminate as planned" +else + echo "trainer 1 not terminate as planned" + rm $file_1 + exit -1 +fi + +if [ -f $file_0 ]; then + rm $file_0 +fi diff --git a/python/paddle/fluid/tests/unittests/mlu/test_launch_cloud_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_launch_cloud_mlu.sh new file mode 100644 index 00000000000..b93b21c1bdf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_launch_cloud_mlu.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +# use paddlecloud +echo "begin test use paddlecloud" +cluster_node_ips="127.0.0.1,127.0.0.2" +export PADDLE_TRAINERS_NUM=2 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 +export PADDLE_TRAINER_ID=0 + +export PADDLE_PORT=35789 +export TRAINER_PORTS_NUM=2 + +distributed_args="--ips=${cluster_node_ips} --mlus=0,1 --log_dir=testlog" +MLU_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process_mlu.py fleetlaunchcloud + +str1="selected_mlus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0" +str2="selected_mlus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1" +file_0="multi_process_fleetlaunchcloud.check_0.log" +file_1="multi_process_fleetlaunchcloud.check_1.log" + +echo "paddlecloud params test" +if grep -q "$str1" "$file_0"; then + echo "find trainer 0" +else + echo "not find trainer 0" + exit -1 +fi + +if grep -q "$str2" "$file_1"; then + echo "find trainer 1" +else + echo "not find trainer 1" + exit -1 +fi + +# test async poll process +if [ -f $file_0 ]; then + rm $file_0 +fi +if [ -f $file_1 ]; then + rm $file_1 +fi diff --git a/python/paddle/fluid/tests/unittests/mlu/test_launch_nproc_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_launch_nproc_mlu.sh new file mode 100644 index 00000000000..722590dc87f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_launch_nproc_mlu.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+set -e
+export FLAGS_START_PORT=35789
+
+export MLU_VISIBLE_DEVICES=0,1
+
+function test_nproc_0(){
+    mlus=$1
+    file_0="fleet_nproc_0.check_0.log"
+    rm -f ${file_0}
+    distributed_args="--log_dir=testlog --nproc_per_node=1"
+    python -m paddle.distributed.launch ${distributed_args} nproc_process_mlu.py fleet_nproc_0
+
+    str0="selected_mlus:${mlus} worker_endpoints:127.0.0.1:35789 trainers_num:1 current_endpoint:127.0.0.1:35789 trainer_id:0"
+    if grep -q "$str0" "$file_0"; then
+        echo "find trainer 0"
+    else
+        echo "not find trainer 0"
+        exit -1
+    fi
+    if [ -f $file_0 ]; then
+        rm $file_0
+    fi
+}
+
+
+function test_nproc_1(){
+    file_0="fleet_nproc_1.check_0.log"
+    file_1="fleet_nproc_1.check_1.log"
+    rm -f ${file_0} ${file_1}
+
+    distributed_args="--log_dir=testlog --nproc_per_node=2"
+    python -m paddle.distributed.launch ${distributed_args} nproc_process_mlu.py fleet_nproc_1
+
+    str0="selected_mlus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35789 trainer_id:0"
+    if grep -q "$str0" "$file_0"; then
+        echo "find trainer 0"
+    else
+        echo "not find trainer 0"
+        exit -1
+    fi
+
+    str1="selected_mlus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35790 trainer_id:1"
+    if grep -q "$str1" "$file_1"; then
+        echo "find trainer 1"
+    else
+        echo "not find trainer 1"
+        exit -1
+    fi
+    if [ -f $file_0 ]; then
+        rm $file_0
+    fi
+    if [ -f $file_1 ]; then
+        rm $file_1
+    fi
+}
+
+test_nproc_0 "0,1"
+
+test_nproc_1
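
Reviewer note: the core of this patch, beyond wiring up the `--mlus` flag and the `cncl` backend, is the device remapping in `get_mlus()` in launch_utils.py, which converts the ids passed via `--mlus` into indices relative to `MLU_VISIBLE_DEVICES` so that `FLAGS_selected_mlus` always refers to the visible ordering. The standalone sketch below (the function name and the `__main__` demo are illustrative, not part of the patch) reproduces that mapping under the assumption that `MLU_VISIBLE_DEVICES` is set; the real helper additionally falls back to `fluid.core.get_mlu_device_count()` when `--mlus` is omitted.

    import os


    def relative_mlu_ids(requested):
        """Map the ids passed via --mlus onto indices relative to
        MLU_VISIBLE_DEVICES, mirroring get_mlus() in launch_utils.py."""
        requested_ids = [x.strip() for x in requested.split(",")]
        visible = os.getenv("MLU_VISIBLE_DEVICES")
        if visible is None or visible == "":
            # No visibility restriction: use the ids as given.
            return requested_ids
        visible_ids = visible.split(",")
        for x in requested_ids:
            assert x in visible_ids, (
                "Can't find your mlus %s in MLU_VISIBLE_DEVICES[%s]." % (x, visible))
        # e.g. MLU_VISIBLE_DEVICES=4,5,6,7 and --mlus=4,5,6,7  ->  [0, 1, 2, 3]
        return [visible_ids.index(x) for x in requested_ids]


    if __name__ == "__main__":
        os.environ["MLU_VISIBLE_DEVICES"] = "4,5,6,7"
        print(relative_mlu_ids("4,5,6,7"))  # prints [0, 1, 2, 3]

End to end, the invocation pattern exercised by the new shell tests is `MLU_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch --mlus=0,1 <training_script>`, after which `start_local_trainers()` exports `FLAGS_selected_mlus` for each spawned process.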