Unverified · Commit 24a063f6, authored by gongweibao and committed by GitHub

Add fleet checkpoint on local fs and remote fs(such as hdfs) for EDL (#22586)

Parent 0c23e3ff
cc_library(fs SRCS fs.cc DEPS string_helper glog boost)
cc_library(shell SRCS shell.cc DEPS string_helper glog timer)
cc_test(test_fs SRCS test_fs.cc DEPS fs shell)
@@ -13,6 +13,8 @@
// limitations under the License.

#include "paddle/fluid/framework/io/shell.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/timer.h"

namespace paddle {
namespace framework {
@@ -296,23 +298,48 @@ std::pair<std::shared_ptr<FILE>, std::shared_ptr<FILE>> shell_p2open(
#endif
}

std::string shell_get_command_output(const std::string& cmd, int time_out,
                                     int sleep_inter, bool print_cmd) {
#if defined _WIN32 || defined __APPLE__
  PADDLE_THROW(platform::errors::Unimplemented(
      "This function(shell_get_command_output) is not implemented under _WIN32 "
      "or __APPLE__."));
#else
  int err_no = 0;
  platform::Timer timer;
  do {
    if (print_cmd) {
      LOG(INFO) << "exec cmd:[" << cmd << "]";
    }
    err_no = 0;
    std::shared_ptr<FILE> pipe = shell_popen(cmd, "r", &err_no);
    string::LineFileReader reader;
    char* buf = reader.getdelim(&*pipe, 0);
    if (err_no == 0) {
      if (buf) {
        return reader.get();
      }
      return "";
    }

    if (sleep_inter > 0) {
      usleep(sleep_inter);
    }

    timer.Pause();
    if (time_out > 0 && timer.ElapsedMS() >= time_out) {
      PADDLE_THROW(paddle::platform::errors::ExecutionTimeout(
          "shell_get_command_output execute error errno:%d and try until "
          "timeout.",
          errno));
      return "";
    }
    timer.Resume();

    pipe = nullptr;
  } while (err_no);

  return "";
#endif
}
......
@@ -65,7 +65,12 @@ inline void shell_execute(const std::string& cmd) {
  } while (err_no == -1);
}

// time_out: in milliseconds; the default -1 means wait forever.
// sleep_inter: in milliseconds; the default -1 means no sleep between retries.
extern std::string shell_get_command_output(const std::string& cmd,
                                            int time_out = -1,
                                            int sleep_inter = -1,
                                            bool print_cmd = false);
}  // namespace framework
}  // namespace paddle
@@ -1494,8 +1494,10 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
  m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
  m.def("is_compiled_with_dist", IsCompiledWithDIST);
  m.def("run_cmd", [](const std::string &cmd, int time_out = -1,
                      int sleep_inter = -1) -> const std::string {
    return paddle::framework::shell_get_command_output(cmd, time_out,
                                                       sleep_inter);
  });
#ifdef PADDLE_WITH_CUDA
  m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
......
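For reference, a minimal sketch of driving the new binding from Python; the command string and the millisecond values are placeholders, not part of the patch:

import paddle.fluid as fluid

# time_out and sleep_inter are in milliseconds, mirroring shell_get_command_output;
# with the defaults (-1) the call retries forever and never sleeps between retries.
out = fluid.core.run_cmd("echo hello", 10 * 1000, 1000)
print(out)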
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from utils import get_cluster, logger
import os
def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
"""
args_node_ips, args_node_ip:string
"""
#you can automatically get ip info while using paddlecloud multi nodes mode.
node_ips = os.getenv("PADDLE_TRAINERS")
assert node_ips is not None, "PADDLE_TRAINERS should not be None"
node_ip = os.getenv("POD_IP")
assert node_ip is not None, "POD_IP should not be None"
node_rank = os.getenv("PADDLE_TRAINER_ID")
assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"
node_ips = node_ips.split(",")
num_nodes = len(node_ips)
node_rank = int(node_rank)
if node_ip != "127.0.0.1" and node_ip != args_node_ip:
logger.warning("Please NOTE: When using paddlecloud, node_ip is \
automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
node_ip: {} from paddlecloud environment.".format(args_node_ip, node_ip))
if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
        logger.warning(
            "Please NOTE: When using paddlecloud, cluster_node_ips is \
automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
Your input cluster_node_ips: {} is not equal to IPs: {} from the \
paddlecloud environment.".format(args_node_ips, node_ips))
started_port = args_port
print("num_nodes:", num_nodes)
if num_nodes > 1:
try:
paddle_port = int(os.getenv("PADDLE_PORT", ""))
paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))
if paddle_port_num >= len(
selected_gpus) and paddle_port != args_port:
logger.warning("Use Cloud specified port:{}.".format(
paddle_port))
started_port = paddle_port
except Exception as e:
print(e)
pass
if started_port is None:
started_port = 6170
logger.debug("parsed from args:node_ips:{} \
node_ip:{} node_rank:{} started_port:{}"
.format(node_ips, node_ip, node_rank, started_port))
ports = [x for x in range(started_port, started_port + len(selected_gpus))]
cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus)
return cluster, cluster.pods[node_rank]
def get_trainers_num():
return int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
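A hedged usage sketch of get_cloud_cluster; the environment values stand in for what paddlecloud normally sets, and the IPs and port are made up:

import os
from paddle.distributed.cloud_utils import get_cloud_cluster

# Placeholder environment, normally provided by paddlecloud.
os.environ["PADDLE_TRAINERS"] = "10.0.0.1,10.0.0.2"
os.environ["POD_IP"] = "10.0.0.1"
os.environ["PADDLE_TRAINER_ID"] = "0"

cluster, pod = get_cloud_cluster("10.0.0.1,10.0.0.2", "10.0.0.1", 6170, ["0", "1"])
print(cluster.trainers_endpoints())  # two endpoints per node, starting at port 6170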
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import sys
import abc
import os
from pathlib import PurePosixPath
import shutil
class FS(object):
@abc.abstractmethod
def list_dirs(self, fs_path):
pass
@abc.abstractmethod
def ls_dir(self, fs_path):
pass
@abc.abstractmethod
def stat(self, fs_path):
pass
@abc.abstractmethod
def upload(self, local_path, fs_path):
pass
@abc.abstractmethod
def download(self, fs_path, local_path):
pass
@abc.abstractmethod
def mkdir(self, fs_path):
pass
@abc.abstractmethod
def mv(self, fs_src_path, fs_dst_path):
pass
@abc.abstractmethod
def rmr(self, fs_path):
pass
@abc.abstractmethod
def rm(self, fs_path):
pass
@abc.abstractmethod
def delete(self, fs_path):
pass
@abc.abstractmethod
def need_upload_download(self):
pass
class LocalFS(FS):
def list_dirs(self, fs_path):
if not self.stat(fs_path):
return []
return [
f for f in os.listdir(fs_path) if os.path.isdir(fs_path + "/" + f)
]
def ls_dir(self, fs_path):
return [f for f in os.listdir(fs_path)]
def stat(self, fs_path):
return os.path.exists(fs_path)
def mkdir(self, fs_path):
assert not os.path.isfile(fs_path), "{} is already a file".format(
fs_path)
os.system("mkdir -p {}".format(fs_path))
def mv(self, fs_src_path, fs_dst_path):
os.rename(fs_src_path, fs_dst_path)
def rmr(self, fs_path):
shutil.rmtree(fs_path)
def rm(self, fs_path):
os.remove(fs_path)
def delete(self, fs_path):
if not self.stat(fs_path):
return
if os.path.isfile(fs_path):
return self.rm(fs_path)
return self.rmr(fs_path)
def need_upload_download(self):
return False
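A short usage sketch of the LocalFS wrapper above (the directory name is arbitrary):

fs = LocalFS()
fs.mkdir("./my_local_dir")        # mkdir -p under the hood
print(fs.stat("./my_local_dir"))  # True once the path exists
print(fs.list_dirs("."))          # immediate sub-directories only
fs.delete("./my_local_dir")       # dispatches to rm() for files, rmr() for directories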
class BDFS(FS):
def __init__(self,
hdfs_name,
hdfs_ugi,
time_out=20 * 60 * 1000,
sleep_inter=1000):
self._base_cmd = "hadoop fs -Dfs.default.name=\"{}\" -Dhadoop.job.ugi=\"{}\"".format(
hdfs_name, hdfs_ugi)
self._time_out = time_out
self._sleep_inter = sleep_inter
def _run_cmd(self, cmd):
ret = fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
if len(ret) <= 0:
return []
lines = ret.splitlines()
return lines
def list_dirs(self, fs_path):
if not self.stat(fs_path):
return []
dirs, _ = self.ls_dir(fs_path)
return dirs
def ls_dir(self, fs_path):
"""
list directory under fs_path, and only give the pure name, not include the fs_path
"""
cmd = "{} -ls {}".format(self._base_cmd, fs_path)
lines = self._run_cmd(cmd)
dirs = []
files = []
for line in lines:
arr = line.split()
if len(arr) != 8:
continue
if fs_path not in arr[7]:
continue
p = PurePosixPath(arr[7])
if arr[0][0] == 'd':
dirs.append(p.name)
else:
files.append(p.name)
return dirs, files
def is_dir(self, fs_path):
cmd = "{} -test -d {} ; echo $?".format(self._base_cmd, fs_path)
test = self._run_cmd(cmd)
if test[0].strip() == "0":
return True
return False
def stat(self, fs_path):
cmd = "{} -test -e {} ; echo $?".format(self._base_cmd, fs_path)
test = self._run_cmd(cmd)
if test[0].strip() == "0":
return True
return False
def upload(self, local_path, fs_path):
cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def download(self, fs_path, local_path):
cmd = "{} -get {} {}/".format(self._base_cmd, fs_path, local_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def mkdir(self, fs_path):
if not self.stat(fs_path):
cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def mv(self, fs_src_path, fs_dst_path):
cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def rmr(self, fs_path):
if not self.stat(fs_path):
return
cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def rm(self, fs_path):
if not self.stat(fs_path):
return
cmd = "{} -rm {}".format(self._base_cmd, fs_path)
return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def delete(self, fs_path):
if not self.stat(fs_path):
return
is_dir = self.is_dir(fs_path)
if is_dir:
return self.rmr(fs_path)
return self.rm(fs_path)
def need_upload_download(self):
return True
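The same flow against HDFS goes through BDFS; the fs.default.name, ugi and paths below are placeholders (the unit test later in this change uses the same pattern):

fs = BDFS("hdfs://nameservice", "user,password",
          time_out=20 * 60 * 1000, sleep_inter=1000)  # both in milliseconds
fs.mkdir("/user/demo/ckpt")                  # hadoop fs -mkdir, skipped if it exists
fs.upload("./local_dir", "/user/demo/ckpt")  # hadoop fs -put
dirs, files = fs.ls_dir("/user/demo/ckpt")   # bare names, without the fs_path prefix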
@@ -36,7 +36,6 @@ launch a process on each of the given gpu card.
"""

from __future__ import print_function
import sys
from sys import version
import subprocess
@@ -45,17 +44,11 @@ import time
import six
import copy
from argparse import ArgumentParser, REMAINDER
import paddle
import paddle.fluid as fluid

from paddle.distributed.utils import *
import paddle.distributed.cloud_utils as cloud_utils


def _print_arguments(args):
@@ -65,32 +58,6 @@ def _print_arguments(args):
    print("------------------------------------------------")


def _parse_args():
    """
    Helper function parsing the command line options
@@ -146,6 +113,12 @@ POD_IP (current node ip address, not needed for local training)
        "each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training."
    )

    parser.add_argument(
        "--log_level",
        type=int,
        default=20,  # logging.INFO, details are here:https://docs.python.org/3/library/logging.html#levels
        help="Logging level, default is logging.INFO")

    parser.add_argument(
        "--log_dir",
        type=str,
@@ -166,196 +139,97 @@ POD_IP (current node ip address, not needed for local training)
    return parser.parse_args()


def get_cluster_from_args(args, selected_gpus):
    node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
    node_ip = args.node_ip
    node_rank = node_ips.index(node_ip)

    logger.debug("parsed from args:node_ips:{} node_ip:{} node_rank:{}".format(
        node_ips, node_ip, node_rank))

    free_ports = None
    if not args.use_paddlecloud and len(
            node_ips) <= 1 and args.started_port is None:
        free_ports = find_free_ports(len(selected_gpus))
        if free_ports is not None:
            free_ports = list(free_ports)
    else:
        free_ports = [
            x
            for x in range(args.started_port, args.started_port + len(
                selected_gpus))
        ]

    return get_cluster(node_ips, node_ip, free_ports, selected_gpus)


def get_gpus(selected_gpus):
    if selected_gpus is None:
        gpus_num = fluid.core.get_cuda_device_count()
        selected_gpus = [str(x) for x in range(0, gpus_num)]
    else:
        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
        if cuda_visible_devices is None or cuda_visible_devices == "":
            selected_gpus = [x.strip() for x in selected_gpus.split(',')]
        else:
            # change selected_gpus into relative values
            # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
            # therefore selected_gpus=0,1,2,3
            cuda_visible_devices_list = cuda_visible_devices.split(',')
            for x in selected_gpus.split(','):
                assert x in cuda_visible_devices_list, "Can't find "\
                    "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
                    % (x, cuda_visible_devices)
            selected_gpus = [
                cuda_visible_devices_list.index(x.strip())
                for x in selected_gpus.split(',')
            ]

    return selected_gpus


def launch(args):
    # parse arguments, used for cloud-single-machine and local
    selected_gpus = get_gpus(args.selected_gpus)
    trainers_num = cloud_utils.get_trainers_num()
    logger.debug("parsed from args trainers_num:{} selected_gpus:{}".format(
        trainers_num, selected_gpus))

    cluster = None
    pod = None

    if args.use_paddlecloud and trainers_num != 1:
        cluster, pod = cloud_utils.get_cloud_cluster(
            args.cluster_node_ips, args.node_ip, args.started_port,
            selected_gpus)
        logger.info("get cluster from cloud:{}".format(cluster))
    else:
        cluster, pod = get_cluster_from_args(args, selected_gpus)
        logger.info("get cluster from args:{}".format(cluster))

    procs = start_local_trainers(
        cluster,
        pod,
        training_script=args.training_script,
        training_script_args=args.training_script_args,
        log_dir=args.log_dir)

    while True:
        alive = watch_local_trainers(procs, cluster.trainers_nranks())

        if not alive:
            logger.info("Local procs complete, POD info:{}".format(pod))
            break

        time.sleep(3)


if __name__ == "__main__":
    args = _parse_args()

    logger = get_logger(args.log_level)

    if args.print_config:
        _print_arguments(args)

    launch(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import logging
import socket
import time
import os
import signal
import copy
import sys
import subprocess
from contextlib import closing
import distutils.util  # used by add_arguments() below
logger = logging.getLogger("root")
logger.propagate = False
class Hdfs(object):
def __init__(self):
self.hdfs_ugi = None
self.hdfs_name = None
self.hdfs_path = None
def is_valid(self):
return self.hdfs_ugi is not None and \
self.hdfs_name is not None and \
self.hdfs_path is not None
    def __str__(self):
        return "hdfs_ugi:{} hdfs_name:{} hdfs_path:{}".format(
            self.hdfs_ugi, self.hdfs_name, self.hdfs_path)
def __eq__(self, n):
return self.hdfs_ugi == n.hdfs_ugi and \
self.hdfs_name == n.hdfs_name and \
self.hdfs_path == n.hdfs_path
def __ne__(self, n):
return not self == n
class Cluster(object):
def __init__(self, hdfs):
self.job_server = None
self.pods = []
self.hdfs = None
self.job_stage_flag = None
def __str__(self):
return "job_server:{} pods:{} job_stage_flag:{} hdfs:{}".format(
self.job_server, [str(pod) for pod in self.pods],
self.job_stage_flag, self.hdfs)
def __eq__(self, cluster):
if len(self.pods) != len(cluster.pods):
return False
for a, b in zip(self.pods, cluster.pods):
if a != b:
return False
if self.job_stage_flag != cluster.job_stage_flag:
return False
return True
def __ne__(self, cluster):
return not self.__eq__(cluster)
    def update_pods(self, cluster):
        self.pods = copy.copy(cluster.pods)
def trainers_nranks(self):
return len(self.trainers_endpoints())
def pods_nranks(self):
return len(self.pods)
def trainers_endpoints(self):
r = []
for pod in self.pods:
for t in pod.trainers:
r.append(t.endpoint)
return r
def pods_endpoints(self):
r = []
for pod in self.pods:
ep = "{}:{}".format(pod.addr, pod.port)
assert pod.port != None and pod.addr != None, "{} not a valid endpoint".format(
ep)
r.append(ep)
return r
def get_pod_by_id(self, pod_id):
for pod in self.pods:
if str(pod_id) == str(pod.id):
return pod
return None
class JobServer(object):
def __init__(self):
self.endpoint = None
def __str__(self):
return "{}".format(self.endpoint)
    def __eq__(self, j):
        return self.endpoint == j.endpoint
def __ne__(self, j):
return not self == j
class Trainer(object):
def __init__(self):
self.gpus = []
self.endpoint = None
self.rank = None
def __str__(self):
return "gpu:{} endpoint:{} rank:{}".format(self.gpus, self.endpoint,
self.rank)
def __eq__(self, t):
if len(self.gpus) != len(t.gpus):
return False
if self.endpoint != t.endpoint or \
self.rank != t.rank :
return False
for a, b in zip(self.gpus, t.gpus):
if a != b:
return False
return True
def __ne__(self, t):
return not self == t
def rank(self):
return self.rank
class Pod(object):
def __init__(self):
self.rank = None
self.id = None
self.addr = None
self.port = None
self.trainers = []
self.gpus = []
def __str__(self):
return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{}".format(
self.rank, self.id, self.addr, self.port, self.gpus,
[str(t) for t in self.trainers])
def __eq__(self, pod):
if self.rank != pod.rank or \
self.id != pod.id or \
self.addr != pod.addr or \
self.port != pod.port:
logger.debug("pod {} != pod".format(self, pod))
return False
if len(self.trainers) != len(pod.trainers):
logger.debug("trainers {} != {}".format(self.trainers,
pod.trainers))
return False
for i in range(len(self.trainers)):
if self.trainers[i] != pod.trainers[i]:
logger.debug("trainer {} != {}".format(self.trainers[i],
pod.trainers[i]))
return False
return True
def __ne__(self, pod):
return not self == pod
def parse_response(self, res_pods):
pass
def rank(self):
return self.rank
def get_visible_gpus(self):
r = ""
for g in self.gpus:
r += "{},".format(g)
assert r != "", "this pod {} can't see any gpus".format(self)
r = r[:-1]
return r
def get_logger(log_level, name="root"):
logger = logging.getLogger(name)
logger.setLevel(log_level)
log_handler = logging.StreamHandler()
log_format = logging.Formatter(
'%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
log_handler.setFormatter(log_format)
logger.addHandler(log_handler)
return logger
def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus):
assert type(paddle_ports) is list, "paddle_ports must be list"
cluster = Cluster(hdfs=None)
trainer_rank = 0
for node_rank, ip in enumerate(node_ips):
pod = Pod()
pod.rank = node_rank
pod.addr = ip
for i in range(len(selected_gpus)):
trainer = Trainer()
trainer.gpus.append(selected_gpus[i])
trainer.endpoint = "%s:%d" % (ip, paddle_ports[i])
trainer.rank = trainer_rank
trainer_rank += 1
pod.trainers.append(trainer)
cluster.pods.append(pod)
pod_rank = node_ips.index(node_ip)
return cluster, cluster.pods[pod_rank]
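For illustration, the Cluster that get_cluster assembles for two nodes with two GPUs each (addresses and ports are made up):

cluster, pod = get_cluster(["10.0.0.1", "10.0.0.2"], "10.0.0.1",
                           [6170, 6171], ["0", "1"])
print(cluster.trainers_nranks())     # 4: one trainer per (node, gpu) pair
print(cluster.trainers_endpoints())  # ['10.0.0.1:6170', '10.0.0.1:6171', '10.0.0.2:6170', '10.0.0.2:6171']
print(pod.rank)                      # 0, the pod that matches node_ip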
def terminate_local_procs(procs):
for p in procs:
if p.proc.poll() is None:
p.proc.terminate()
p.log_fn.close()
logger.debug("terminate process id:{}".format(p.proc.pid))
    # wait for all processes to terminate
# time.sleep(3)
for step in range(0, 50):
alive = False
for p in procs:
            if p.proc.poll() is None:  # not terminated yet
os.kill(p.proc.pid, signal.SIGKILL)
alive = True
if not alive:
logger.info("terminate all the procs")
return
time.sleep(3)
    logger.fatal("can't kill all processes, exiting")
exit(1)
def get_host_name_ip():
try:
host_name = socket.gethostname()
host_ip = socket.gethostbyname(host_name)
return host_name, host_ip
except:
return None
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def find_free_ports(num):
def __free_port():
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
port_set = set()
step = 0
while True:
port = __free_port()
if port not in port_set:
port_set.add(port)
if len(port_set) >= num:
return port_set
step += 1
if step > 100:
print(
"can't find avilable port and use the specified static port now!"
)
return None
return None
class TrainerProc(object):
def __init__(self):
self.proc = None
self.log_fn = None
self.rank = None
self.cmd = None
def start_local_trainers(cluster,
pod,
training_script,
training_script_args,
log_dir=None):
current_env = copy.copy(os.environ.copy())
#paddle broadcast ncclUniqueId use socket, and
#proxy maybe make trainers unreachable, so delete them.
#if we set them to "", grpc will log error message "bad uri"
#so just delete them.
current_env.pop("http_proxy", None)
current_env.pop("https_proxy", None)
procs = []
for idx, t in enumerate(pod.trainers):
proc_env = {
"FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]),
"PADDLE_TRAINER_ID": "%d" % t.rank,
"PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
}
current_env.update(proc_env)
logger.debug("trainer proc env:{}".format(current_env))
cmd = [sys.executable, "-u", training_script] + training_script_args
logger.info("start trainer proc:{} env:{}".format(cmd, proc_env))
fn = None
if log_dir is not None:
os.system("mkdir -p {}".format(log_dir))
fn = open("%s/workerlog.%d" % (log_dir, idx), "a")
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
else:
proc = subprocess.Popen(cmd, env=current_env)
tp = TrainerProc()
tp.proc = proc
tp.rank = t.rank
tp.log_fn = fn
tp.cmd = cmd
procs.append(tp)
return procs
def watch_local_trainers(procs, nranks):
try:
error = False
error_rank = []
# wait all process finish or one error
alive = False
for p in procs:
ret = p.proc.poll()
if ret is None:
alive = True
elif ret != 0:
error = True
error_rank.append(p.rank)
if error:
terminate_local_procs(procs)
exit(1)
except KeyboardInterrupt:
logger.warning("KeyboardInterrupt, exit")
terminate_local_procs(procs)
raise
except SystemExit:
logger.error(
"ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
format(nranks, error_rank))
terminate_local_procs(procs)
raise
except:
logger.error(
"ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
format(nranks, error_rank))
terminate_local_procs(procs)
raise
return alive
@@ -26,10 +26,14 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Mode
from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
from paddle.fluid import compiler
from paddle.distributed.fs_wrapper import LocalFS, BDFS

import os
import sys
import six
import json
import re
import shutil


class LambConfig(object):
@@ -42,6 +46,21 @@ class DistFCConfig(object):
    pass

class TrainStatus(object):
def __init__(self, epoch_no=-1):
# completed epoch
self._epoch_no = epoch_no
def next(self):
return self._epoch_no + 1
def __eq__(self, t):
return self._epoch_no == t._epoch_no
def __ne__(self, t):
return not self == t

class Collective(Fleet):
    def __init__(self):
        super(Collective, self).__init__(Mode.COLLECTIVE)
@@ -51,6 +70,8 @@ class Collective(Fleet):
        self._origin_program = None
        self._transpiled_program = None
        self.main_program = None

        self._checkoint_prefix = "__paddle_fleet_checkpoint__"
        self._param_file_name = "_paddle_fleet_param__"

    def init_worker(self):
        logging.warn(
@@ -103,7 +124,11 @@ class Collective(Fleet):
            executor, main_program, None, None,
            export_for_deployment)

    def save_persistables(self,
                          executor,
                          dirname,
                          main_program=None,
                          filename=None):
        """
        This function filters out all variables with `persistable==True` from
        the given `main_program` and then saves these variables to the folder
@@ -125,7 +150,182 @@
            "In fleet.save_inference_model() function, main_program " \
            "must be as Program type."

        io.save_persistables(executor, dirname, main_program, filename=filename)
def _save_train_status(self, path, train_status):
d = {}
d["epoch_no"] = train_status._epoch_no
file_name = "{}/fleet_train_status".format(path)
with open(file_name, 'w') as f:
json.dump(d, f)
def _load_train_status(self, path):
file_name = "{}/fleet_train_status".format(path)
r = TrainStatus()
if not os.path.isfile(file_name):
return r
d = {}
with open(file_name, 'r') as f:
d = json.load(f)
assert "epoch_no" in d, "Can't find epoch_no in dict from train_status file:{}".format(
d)
r._epoch_no = d["epoch_no"]
assert r._epoch_no >= 0, "Data in checkpoint file is not valid:{}".format(
d)
return r
def _get_last_checkpoint_no(self, root_path, fs):
"""
only get the first depth
"""
max_no = -1
d = {}
dirs = fs.list_dirs(root_path)
for dir in dirs:
g = dir.split(".")
if len(g) != 2:
continue
if g[0] != "__paddle_fleet_checkpoint__":
continue
try:
n = int(g[1])
if n > max_no:
max_no = n
except:
continue
return max_no
def clean_redundant_check_points(self,
root_path,
fs=LocalFS(),
checkpoint_num=1):
max_no = self._get_last_checkpoint_no(root_path, fs)
if max_no < 0:
return
if checkpoint_num < 1:
checkpoint_num = 1
dirs = fs.list_dirs(root_path)
for dir in dirs:
g = dir.split(".")
if len(g) != 2:
continue
if g[0] != self._checkoint_prefix:
continue
try:
n = int(g[1])
if n <= max_no - checkpoint_num:
path = "{}/{}.{}".format(root_path, self._checkoint_prefix,
n)
fs.rmr(path)
except Exception as e:
print(e)
continue
def save_check_point(self,
executor,
path,
train_status,
main_program=None,
fs=LocalFS(),
local_cache_path=".cache",
remain_all_checkpoint=True):
"""
This function save persistables and current epoch num to path.
"""
if main_program == None:
main_program = self._transpiled_program
if not fs.stat(path):
fs.mkdir(path)
max_no = self._get_last_checkpoint_no(path, fs=fs)
if max_no < 0:
max_no = -1
real_path = "{}/{}.{}".format(path, self._checkoint_prefix, max_no + 1)
tmp_path = "{}.tmp".format(real_path)
saved_path = tmp_path
local_fs = LocalFS()
cache_path = None
if fs.need_upload_download():
cache_path = "{}/{}.{}.saved_cache".format(
local_cache_path, self._checkoint_prefix, max_no + 1)
if not local_fs.stat(cache_path):
local_fs.mkdir(cache_path)
saved_path = cache_path
self.save_persistables(
executor=executor,
dirname=saved_path,
main_program=main_program,
filename=self._param_file_name)
self._save_train_status(path=saved_path, train_status=train_status)
if fs.need_upload_download():
fs.delete(tmp_path)
fs.upload(cache_path, tmp_path)
fs.mv(tmp_path, real_path)
if not remain_all_checkpoint:
self.clean_redundant_check_points(path)
def load_check_point(self,
executor,
path,
trainer_id,
main_program=None,
fs=LocalFS(),
local_cache_path=".cache",
ignore_empty=True):
"""
This function load persistables and current epoch num from path.
"""
max_no = self._get_last_checkpoint_no(path, fs)
if not ignore_empty:
assert max_no >= 0, "Can't find checkpoint"
if max_no < 0:
return None
local_fs = LocalFS()
if fs.need_upload_download():
cache_path = "{}/{}.{}.load_cache.{}".format(
local_cache_path, self._checkoint_prefix, max_no, trainer_id)
if local_fs.stat(cache_path):
local_fs.delete(cache_path)
real_path = "{}/{}.{}".format(path, self._checkoint_prefix, max_no)
load_path = real_path
if fs.need_upload_download():
fs.download(real_path, cache_path)
load_path = cache_path
if main_program == None:
main_program = self._transpiled_program
io.load_persistables(
executor=executor,
dirname=load_path,
main_program=main_program,
filename=self._param_file_name)
return self._load_train_status(load_path)
fleet = Collective()
......
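A hedged end-to-end sketch of the checkpoint API added above, mirroring the unit test further below; the executor/program setup, epoch count and checkpoint directory are placeholders:

from paddle.fluid.incubate.fleet.collective import fleet, TrainStatus
from paddle.distributed.fs_wrapper import LocalFS

# assumes fleet.init(role), dist_optimizer.minimize(...) and startup already ran,
# with `exe` being the fluid.Executor used for training
fs = LocalFS()            # or BDFS(hdfs_name, hdfs_ugi) to checkpoint to HDFS
ckpt_dir = "./fleet_ckpt"

train_status = fleet.load_check_point(exe, ckpt_dir, trainer_id=0, fs=fs)
start_epoch = train_status.next() if train_status is not None else 0

for epoch in range(start_epoch, 10):
    # ... run one epoch of training ...
    fleet.save_check_point(exe, ckpt_dir,
                           train_status=TrainStatus(epoch), fs=fs,
                           remain_all_checkpoint=False)  # keep only the latest checkpoint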
@@ -28,6 +28,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_half_async)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_sync)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint)
foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
  list(REMOVE_ITEM TEST_OPS ${TEST_OP})
endforeach()
@@ -301,6 +302,7 @@ if(WITH_DISTRIBUTE)
  if(WITH_GPU)
    # NOTE. test_launch only work in gpu collective mode
    bash_test_modules(test_launch MODULES test_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
    py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
  endif()
  bash_test_modules(test_launch_ps MODULES test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
import os
from paddle.distributed.fs_wrapper import LocalFS, BDFS
class FleetTest(unittest.TestCase):
def _test_check_point(self, fs, dir_path):
file_name = "persistables"
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
feeder = fluid.DataFeeder(
feed_list=[image, label], place=fluid.CPUPlace())
predict = fluid.layers.fc(input=image, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=predict, label=label)
avg_loss = fluid.layers.mean(loss)
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)
dist_optimizer = fleet.distributed_optimizer(optimizer)
dist_optimizer.minimize(avg_loss)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
status = TrainStatus(2)
fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
n1 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
status2 = fleet.load_check_point(exe, dir_path, trainer_id=0, fs=fs)
self.assertEqual(status2, status)
fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
n2 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
self.assertEqual(n2, n1 + 1)
fleet.clean_redundant_check_points(dir_path, fs=fs)
def test_hdfs_check_point(self):
try:
fs = BDFS("xxxx", "xxxx", 1 * 1000, 1 * 1000)
dir_path = "/user/Paddle_Data/gongweibao/edl_test/my_paddle_model"
self._test_check_point(fs, dir_path)
except Exception as e:
print(e)
def test_local_check_point(self):
fs = LocalFS()
dir_path = "./my_paddle_model"
self._test_check_point(fs, dir_path)
if __name__ == '__main__':
unittest.main()
@@ -6,6 +6,7 @@ launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
python ${launch_py} multi_process.py

# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="10.0.0.1"
node_ip="10.0.0.1"
export PADDLE_TRAINERS_NUM=2
@@ -14,7 +15,7 @@ export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35019
export TRAINER_PORTS_NUM=2

distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
@@ -47,8 +48,9 @@ if [ -f $file_1 ]; then
    rm $file_1
fi

unset PADDLE_PORT
unset TRAINER_PORTS_NUM

echo ""
echo "paddle.distributed.launch async poll process test"
......
@@ -19,3 +19,4 @@ decorator
prettytable
objgraph
astor
pathlib