Unverified commit 6af531b7, authored by xiayanming, committed by GitHub

fleet support elastic scale up/down (#36684)

* fleet support elastic train

* fleet support elastic train

* support elastic

* add unittest

* fix unittest bug

* fix unittest bug

* fix unittest bug

* fix unittest coverage

* fix unittest coverage

* fix unittest coverage

* fix unittest coverage

* fix unittest coverage

* fix elastic bug

* fix ci fail

* fix ci fail

* fix elastic bug

* fix elastic bug

* fix joint debugging bug

* fix joint debugging bug

* fix windows ci failed

* fix windows ci failed
Parent 9a9345fa
...
@@ -50,7 +50,10 @@ if __name__ == '__main__':
     parser.add_argument(
         "--elastic_server", type=str, help="etcd server host:port")
     parser.add_argument("--job_id", type=str, help="job unique id")
-    parser.add_argument("--np", type=int, help="job pod/node number")
+    parser.add_argument(
+        "--np",
+        type=str,
+        help="job pod/node number, need to be 'MIN' or 'MIN:MAX' format")
     parser.add_argument("action", type=str, help="action to take")
     args = parser.parse_args()
...
@@ -58,7 +61,7 @@ if __name__ == '__main__':
     server = args.elastic_server or os.getenv('PADDLE_ELASTIC_SERVER')
     name = args.job_id or os.getenv('PADDLE_ELASTIC_JOB_ID')
-    np = args.np or int(os.getenv('PADDLE_ELASTIC_NP', 0))
+    np = int(args.np.split(":")[0]) or int(os.getenv('PADDLE_ELASTIC_NP', 0))
     cmd = Command(server, name)
...
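The `--np` flag now takes a string so that a fixed count ("2") and an elastic range ("2:4") can share one option; the scale tool itself only consumes the lower bound via `args.np.split(":")[0]`. Below is a minimal sketch of parsing both forms; the helper name `parse_np` is illustrative only and not part of the patch.

# Illustrative sketch only: split an elastic --np value such as "2" or
# "2:4" into (min_np, max_np); a bare value means a fixed node count.
def parse_np(np_str):
    parts = np_str.split(":")
    min_np = int(parts[0])
    max_np = int(parts[1]) if len(parts) > 1 else min_np
    return min_np, max_np

print(parse_np("2"))    # -> (2, 2)
print(parse_np("2:4"))  # -> (2, 4)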
...
@@ -33,7 +33,7 @@ def enable_elastic(args, distribute_mode):
     if not args.job_id and not os.getenv('PADDLE_ELASTIC_JOB_ID'):
         return False

-    if not args.np and not int(os.getenv('PADDLE_ELASTIC_NP', 0)):
+    if not args.np and not os.getenv('PADDLE_ELASTIC_NP'):
        return False

     return True
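One concrete reason the `int(...)` coercion had to go from this check: with the range format, `PADDLE_ELASTIC_NP` may now hold a value such as "2:4" (the test script later in this commit exports exactly that), which cannot be converted to an integer directly. A quick demonstration of the failure and of the plain presence check that replaces it:

# Sketch: why a presence check is safer than int() once np can be a range.
import os

os.environ['PADDLE_ELASTIC_NP'] = "2:4"
try:
    int(os.getenv('PADDLE_ELASTIC_NP', 0))
except ValueError as e:
    print(e)  # invalid literal for int() with base 10: '2:4'
print(bool(os.getenv('PADDLE_ELASTIC_NP')))  # True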
...@@ -41,7 +41,11 @@ def enable_elastic(args, distribute_mode): ...@@ -41,7 +41,11 @@ def enable_elastic(args, distribute_mode):
def launch_elastic(args, distribute_mode): def launch_elastic(args, distribute_mode):
elastic = ElasticManager(args) server = args.elastic_server or os.getenv('PADDLE_ELASTIC_SERVER')
srv, port = server.split(':')
import etcd3
etcd_client = etcd3.client(host=srv, port=port)
elastic = ElasticManager(args, etcd_client)
signal.signal(signal.SIGTERM, elastic.signal_handler) signal.signal(signal.SIGTERM, elastic.signal_handler)
signal.signal(signal.SIGABRT, elastic.signal_handler) signal.signal(signal.SIGABRT, elastic.signal_handler)
......
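`launch_elastic` now builds the etcd3 client itself and injects it into `ElasticManager`, which is also what lets the unit tests further down substitute a mock client. A standalone sketch of that connection step, assuming the `etcd3` package is installed and an etcd server is reachable at the given address:

# Sketch: resolve the elastic server address and create the etcd3 client
# that ElasticManager uses for host registration and watch callbacks.
import os
import etcd3

server = os.getenv('PADDLE_ELASTIC_SERVER', '127.0.0.1:2379')
host, port = server.split(':')
etcd_client = etcd3.client(host=host, port=int(port))
print(etcd_client.status().version)  # raises if no etcd server is running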
...
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import tempfile
 from paddle.distributed.fleet import launch_utils
 from paddle.distributed.fleet import cloud_utils
 from paddle.distributed.fleet import ascend_utils
...
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import time
import unittest
import argparse
from warnings import catch_warnings

from paddle.distributed.fleet.elastic import enable_elastic, launch_elastic
from paddle.distributed.fleet.launch_utils import DistributeMode


class TestElasticInit(unittest.TestCase):
    def setUp(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2:4"

        self.args = Argument()

    def test_enable_elastic(self):
        result = enable_elastic(self.args, DistributeMode.COLLECTIVE)
        self.assertEqual(result, True)

    def test_launch_elastic(self):
        try:
            launch_elastic(self.args, DistributeMode.COLLECTIVE)
        except Exception as e:
            pass


if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import time
import unittest
import argparse

from paddle.distributed.fleet.elastic.manager import ElasticManager
from paddle.distributed.fleet.elastic.manager import ELASTIC_TIMEOUT


class MockLease():
    def refresh(self):
        pass


class MockEtcdClient:
    def __init__(self, lease=None):
        self._lease = lease

    def put(self, key, value, lease=None):
        pass

    def get(self, key):
        value = "0"
        return value, value

    def delete_prefix(self, key):
        pass

    def get_prefix(self, key_prefix):
        hosts = ["10.10.10.1:6001", "10.10.10.2:6001"]
        return hosts

    def add_watch_callback(self, *args, **kwargs):
        return "host_watch"

    def add_watch_prefix_callback(self, key_prefix, callback, **kwargs):
        callback(None)
        return "host_watch"

    def cancel_watch(self, watch_id):
        pass

    def delete(self, key):
        pass

    def lease(self, ttl):
        if self._lease:
            return self._lease
        else:
            return MockLease()
class TestElasticManager(unittest.TestCase):
    def setUp(self):
        self.etcd_client = MockEtcdClient()

    def test_elastic_manager_init(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2"
            gpus = "0"
            nproc_per_node = 1
            host = None
            host_port = None
            scale = None
            force = None
            backend = 'gloo'

        args = Argument()

        class _MockLease():
            def refresh(self):
                raise ValueError("valid error, this only for unittest")

        etcd_client = MockEtcdClient(lease=_MockLease())
        elastic = ElasticManager(args, etcd_client=etcd_client)

    def test_match_faulttolerance(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2"
            gpus = "0"
            nproc_per_node = 1
            host = None
            host_port = None
            scale = None
            force = None
            backend = 'gloo'

        args = Argument()
        elastic = ElasticManager(args, self.etcd_client)
        hosts = ["10.10.10.1:6001", "10.10.10.2:6001"]
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        self.assertEqual(elastic._match(hosts), True)

        hosts = ["10.10.10.1:6001"]
        os.environ['PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001"
        self.assertEqual(elastic._match(hosts), False)
    def test_match_elastic(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2:4"
            gpus = "0"
            nproc_per_node = 1
            host = None
            host_port = None
            scale = None
            force = None
            backend = 'gloo'

        os.environ['PADDLE_ELASTIC_TIMEOUT'] = "60"
        args = Argument()
        os.environ['FLAGS_START_PORT'] = "6001"
        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001,10.10.10.3:6001,10.10.10.4:6001"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001,10.10.10.3:6001,10.10.10.4:6001"
        elastic = ElasticManager(args, self.etcd_client)
        hosts = ["10.10.10.1:6001", "10.10.10.2:6001"]
        self.assertEqual(elastic._match(hosts), False)

        hosts = [
            "10.10.10.1:6001", "10.10.10.2:6001", "10.10.10.3:6001",
            "10.10.10.4:6001"
        ]
        self.assertEqual(elastic._match(hosts), True)

        hosts = ["10.10.10.1:6001", "10.10.10.2:6001", "10.10.10.3:6001"]
        self.assertEqual(elastic._match(hosts), False)

        hosts = ["10.10.10.1:6001"]
        self.assertEqual(elastic._match(hosts), False)

        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        elastic = ElasticManager(args, self.etcd_client)
        hosts = ["10.10.10.1:6001", "10.10.10.2:6001"]
        self.assertEqual(elastic._match(hosts), True)

        # TODO test timeout
        #time.sleep(60)
        #self.assertEqual(elastic._match(hosts), True)
    def test_update_hosts_for_faulttolerance(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "0"
            gpus = "0"
            nproc_per_node = 1
            host = None
            host_port = None
            scale = None
            force = None
            backend = 'gloo'

        args = Argument()
        os.environ['FLAGS_START_PORT'] = "6001"
        os.environ['PADDLE_ELASTIC_NP'] = "2"
        os.environ['PADDLE_TRAINERS'] = "10.10.10.1,10.10.10.2"
        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        elastic = ElasticManager(args, self.etcd_client)

        # add 10.10.10.3:6001
        os.environ['PADDLE_TRAINER_ID'] = "0"
        elastic.host_port = "10.10.10.1:6001"
        elastic.hosts = ["10.10.10.1:6001", "10.10.10.2:6001"]
        elastic._update_hosts()
        self.assertEqual(os.getenv('PADDLE_TRAINERS'), "10.10.10.1,10.10.10.2")

        # add 10.10.10.3:6001
        elastic.host_port = "10.10.10.3:6001"
        elastic.hosts = ["10.10.10.1:6001", "10.10.10.3:6001"]
        os.environ['PADDLE_TRAINER_ID'] = "1"
        elastic._update_hosts()
        self.assertEqual(os.getenv('PADDLE_TRAINERS'), "10.10.10.1,10.10.10.3")

        elastic.host_port = "10.10.10.3:6001"
        elastic.hosts = ["10.10.10.1:6001", "10.10.10.3:6001"]
        os.environ['PADDLE_TRAINER_ID'] = "-1"
        elastic._update_hosts()
        self.assertEqual(os.getenv('PADDLE_TRAINERS'), "10.10.10.1,10.10.10.3")
    def test_update_hosts_for_elastic(self):
        #######################
        # elastic, scale up   #
        #######################
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2:4"
            gpus = "0"
            nproc_per_node = 1
            host = None
            host_port = None
            scale = None
            force = None
            backend = 'gloo'

        args = Argument()
        os.environ['FLAGS_START_PORT'] = "6001"
        os.environ['PADDLE_TRAINERS'] = "10.10.10.1,10.10.10.2"
        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        elastic = ElasticManager(args, self.etcd_client)

        # add 10.10.10.3:6001
        elastic.host_port = "10.10.10.1:6001"
        elastic.hosts = [
            "10.10.10.1:6001", "10.10.10.2:6001", "10.10.10.3:6001"
        ]
        elastic._update_hosts()
        #self.assertEqual(elastic.all_host_endpoints,
        #                 ["10.10.10.1:6001", "10.10.10.2:6001", "10.10.10.3:6001"])
        self.assertEqual(
            os.getenv('PADDLE_TRAINERS'), "10.10.10.1,10.10.10.2,10.10.10.3")

        #######################
        # elastic, scale down #
        #######################
        os.environ[
            'PADDLE_TRAINERS'] = "10.10.10.0,10.10.10.1,10.10.10.2,10.10.10.3"
        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.0:6000,10.10.10.1:6001,10.10.10.2:6001,10.10.10.3:6001"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.0:6000,10.10.10.1:6001,10.10.10.2:6001,10.10.10.3:6001"
        elastic = ElasticManager(args, self.etcd_client)

        # remove 10.10.10.1:6001
        elastic.host_port = "10.10.10.1:6001"
        elastic.hosts = [
            "10.10.10.1:6001", "10.10.10.2:6001", "10.10.10.3:6001"
        ]
        elastic._update_hosts()
        #self.assertEqual(elastic.all_host_endpoints,
        #                 ["10.10.10.3:6001", "10.10.10.1:6001", "10.10.10.2:6001"])
        self.assertEqual(
            os.getenv('PADDLE_TRAINERS'), "10.10.10.3,10.10.10.1,10.10.10.2")
        self.assertEqual(
            os.getenv('DISTRIBUTED_TRAINER_ENDPOINTS'),
            "10.10.10.3:6001,10.10.10.1:6001,10.10.10.2:6001")

        ############
        os.environ['PADDLE_TRAINERS'] = "10.10.10.1,10.10.10.1"
        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.1:6002,10.10.10.1:6003,10.10.10.1:6004"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.1:6002,10.10.10.1:6003,10.10.10.1:6004"
        elastic = ElasticManager(args, self.etcd_client)

        # remove 10.10.10.1:6001
        elastic.host_port = "10.10.10.1:6001"
        os.environ['PADDLE_TRAINER_ID'] = "-1"
        elastic.hosts = ["10.10.10.1:6001", "10.10.10.1:6001"]
        elastic._update_hosts()
        #self.assertEqual(elastic.all_host_endpoints,
        #                 ["10.10.10.1:6001", "10.10.10.1:6001"])
        self.assertEqual(os.getenv('PADDLE_TRAINERS'), "10.10.10.1,10.10.10.1")
        self.assertEqual(
            os.getenv('DISTRIBUTED_TRAINER_ENDPOINTS'),
            "10.10.10.1:6001,10.10.10.1:6001")
    def test_exit(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2"
            gpus = "0"
            nproc_per_node = 1
            host = None
            host_port = None
            scale = None
            force = None
            backend = 'gloo'

        args = Argument()
        elastic = ElasticManager(args, self.etcd_client)
        elastic.exit()


if __name__ == "__main__":
    unittest.main()
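Both new test modules are self-contained unittest files. A sketch of collecting and running them in one pass follows; the module names are assumptions here, since the file paths were lost from this diff view:

# Sketch: load and run both elastic test modules together.
# Module names are assumed; adjust them to the actual file locations.
import unittest

loader = unittest.defaultTestLoader
suite = unittest.TestSuite()
for name in ("test_fleet_elastic_init", "test_fleet_elastic_manager"):
    suite.addTests(loader.loadTestsFromName(name))
unittest.TextTestRunner(verbosity=2).run(suite)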
...
@@ -15,7 +15,7 @@
 echo "begin test elastic"
 unset GREP_OPTIONS
-rm -rf log
+rm -rf log*
 pids=`ps -ef | grep "python -m paddle.distributed.launch elastic_demo.[py]" | awk '{print $2}'`
 if [ -n "$pids" ]; then
...
@@ -28,6 +28,11 @@ fi
 python -m pip install --no-cache-dir etcd3 -i https://mirror.baidu.com/pypi/simple

+#############################
+#### test fault tolrance ####
+#############################
+
 # common env
 export PADDLE_ELASTIC_NP=2
 export PADDLE_ELASTIC_SERVER=127.0.0.1:2379
...
@@ -137,7 +142,7 @@ export PADDLE_TRAINER_ID=1
 export PADDLE_TRAINERS_NUM=2
 python -m paddle.distributed.launch elastic_demo.py &> log_1.log &
-p1=$!
+p1_1=$!

 for i in {1..10}
 do
...
@@ -184,7 +189,7 @@ export PADDLE_TRAINER_ID=0
 export PADDLE_TRAINERS_NUM=2
 python -m paddle.distributed.launch elastic_demo.py &> log_0.log &
-p0=$!
+p0_1=$!

 for i in {1..10}
 do
...
@@ -205,4 +210,102 @@ check_env
 echo "All check done"
 sleep 3
-kill $p0 $p1
+kill $p0 $p1 $p0_1 $p1_1
#############################
##### test elastic #####
#############################
# common env
export PADDLE_ELASTIC_NP=2:4
export PADDLE_ELASTIC_SERVER=127.0.0.1:2379
export PADDLE_ELASTIC_JOB_ID=elastic-demo-2
# run node 0
export NVIDIA_VISIBLE_DEVICES=0
export CUDA_VISIBLE_DEVICES=0
export DISTRIBUTED_TRAINER_ENDPOINTS=10.10.10.1:8001,10.10.10.2:8001,10.10.10.3:8001
export PADDLE_TRAINERS=10.10.10.1,10.10.10.2,10.10.10.3
export TRAINER_PORTS_NUM=1
export POD_IP=10.10.10.1
export PADDLE_TRAINER_ID=0
export PADDLE_TRAINERS_NUM=3
python -m paddle.distributed.launch elastic_demo.py &> log_pe_0.log &
pe_0=$!
for i in {1..10}
do
    if grep -q "INFO:ELASTIC:not ready" log_pe_0.log; then
        echo "run node 0 ok"
        break
    else
        sleep 10
    fi
    if [ $i -eq 10 ]; then
        echo "run node 0 error"
        exit -1
    fi
done
# run node 1
export NVIDIA_VISIBLE_DEVICES=1
export CUDA_VISIBLE_DEVICES=1
export DISTRIBUTED_TRAINER_ENDPOINTS=10.10.10.1:8001,10.10.10.2:8001,10.10.10.3:8001
export PADDLE_TRAINERS=10.10.10.1,10.10.10.2,10.10.10.3
export TRAINER_PORTS_NUM=1
export POD_IP=10.10.10.2
export PADDLE_TRAINER_ID=1
export PADDLE_TRAINERS_NUM=3
python -m paddle.distributed.launch elastic_demo.py &> log_pe_1.log &
pe_1=$!
for i in {1..10}
do
    if grep -q "INFO:ELASTIC:not ready" log_pe_1.log; then
        echo "run node 1 ok"
        break
    else
        sleep 10
    fi
    if [ $i -eq 10 ]; then
        echo "run node 1 error"
        exit -1
    fi
done
# run node 2
export NVIDIA_VISIBLE_DEVICES=1
export CUDA_VISIBLE_DEVICES=1
export DISTRIBUTED_TRAINER_ENDPOINTS=10.10.10.1:8001,10.10.10.2:8001,10.10.10.3:8001
export PADDLE_TRAINERS=10.10.10.1,10.10.10.2,10.10.10.3
export TRAINER_PORTS_NUM=1
export POD_IP=10.10.10.3
export PADDLE_TRAINER_ID=2
export PADDLE_TRAINERS_NUM=3
python -m paddle.distributed.launch elastic_demo.py &> log_pe_2.log &
pe_2=$!
for i in {1..10}
do
    if grep -q "INFO:ELASTIC:ready with hosts" log_pe_2.log; then
        echo "run node 2 ok"
        break
    else
        sleep 10
    fi
    if [ $i -eq 10 ]; then
        echo "run node 2 error"
        exit -1
    fi
done
lw0="log/workerlog.0"
check_env
echo "All check done"
sleep 3
kill $pe_0 $pe_1 $pe_2