From da9143c1cc91b9b81a9d62a3dcd1b16487513e0e Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Tue, 11 Jun 2019 13:57:59 +0800
Subject: [PATCH] Polish codes of old prs. (#17938)

---
 .../ir/alloc_continuous_space_for_grad_pass.cc |  9 ++++++---
 python/paddle/distributed/launch.py            | 13 ++++++++-----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
index 4da889b400..715ca97f37 100644
--- a/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
+++ b/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
@@ -42,6 +42,9 @@ DEFINE_int32(
 namespace paddle {
 namespace framework {
 namespace ir {
+// Unit of FLAGS_fuse_parameter_memory_size: bytes per MB.
+static constexpr double kMB = 1048576.0;
+
 // SetFuseParameterGroupsSize and SetFuseParameterMemorySize are used in unit
 // tests, because it is invalid to set 'FLAGS_fuse_parameter_memory_size'
 // and 'FLAGS_fuse_parameter_groups_size' directly in unit tests.
@@ -228,8 +231,8 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
     }
     VLOG(10) << out.str()
              << ", group size:" << group_grads_params->at(i).size()
-             << ", group memory size:"
-             << static_cast<double>(gps_size) / 1048576.0 << "(MB)";
+             << ", group memory size:" << static_cast<double>(gps_size) / kMB
+             << "(MB)";
   }
 }
 
@@ -270,7 +273,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
         break;
       }
 
-      if (static_cast<double>(local_group_memory_size) / 1048576.0 >=
+      if (static_cast<double>(local_group_memory_size) / kMB >=
           group_memory_size) {
         break;
       }
diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py
index b685723763..06369ea6b7 100644
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -164,6 +164,13 @@ def start_procs(args):
           ", node_ips:", node_ips, ", nranks:", nranks)
 
     current_env = copy.copy(default_env)
+    # Paddle broadcasts ncclUniqueId over a socket, and a proxy may
+    # make the trainers unreachable, so the proxy variables are deleted.
+    # Setting them to "" instead would make gRPC log a "bad uri" error,
+    # so just remove them from the environment.
+    current_env.pop("http_proxy", None)
+    current_env.pop("https_proxy", None)
+
     procs = []
     cmds = []
     for i in range(0, selected_gpus_num):
@@ -173,11 +180,7 @@ def start_procs(args):
             "PADDLE_CURRENT_ENDPOINT":
             "%s:%d" % (current_node_ip, args.started_port + i),
             "PADDLE_TRAINERS_NUM": "%d" % nranks,
-            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints,
-            # paddle broadcast ncclUniqueId use socket, and
-            # proxy maybe make trainers unreachable, so set them to ""
-            "http_proxy": "",
-            "https_proxy": ""
+            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
         })
 
     cmd = [sys.executable, "-u", args.training_script
--
GitLab
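
The kMB constant names the unit conversion the pass already performs: gradients are accumulated into one fused group until the group's byte size, divided by kMB, reaches the FLAGS_fuse_parameter_memory_size threshold (in MB). A minimal Python sketch of that grouping policy; the function name group_by_memory_size and the sample tensors are illustrative, not part of the patch:

    KMB = 1048576.0  # bytes per MB, mirroring kMB in the C++ pass

    def group_by_memory_size(grads, group_memory_size_mb):
        # grads: ordered list of (name, size_in_bytes) pairs.
        # A group closes as soon as its size in MB reaches the
        # threshold, like the early break in the C++ loop.
        groups, current, current_bytes = [], [], 0
        for name, nbytes in grads:
            current.append(name)
            current_bytes += nbytes
            if current_bytes / KMB >= group_memory_size_mb:
                groups.append(current)
                current, current_bytes = [], 0
        if current:  # flush the trailing, under-threshold group
            groups.append(current)
        return groups

    # 4 MB + 2 MB crosses a 5 MB threshold, so w0/w1 fuse; w2 starts anew.
    print(group_by_memory_size(
        [("w0", 4 << 20), ("w1", 2 << 20), ("w2", 8 << 20)], 5.0))
    # -> [['w0', 'w1'], ['w2']]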
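On the launcher side, the patch pops the proxy variables from the copied environment rather than overriding them with empty strings, because gRPC treats an empty proxy value as a malformed URI ("bad uri"). A minimal sketch of the same pattern outside launch.py; the script name train.py and the endpoint values are placeholders, only the PADDLE_* variable names come from the patch:

    import copy
    import os
    import subprocess
    import sys

    default_env = os.environ.copy()
    current_env = copy.copy(default_env)
    # pop() with a None default is safe whether or not the variable exists;
    # assigning current_env["http_proxy"] = "" would trigger gRPC's "bad uri".
    current_env.pop("http_proxy", None)
    current_env.pop("https_proxy", None)

    # Layer the per-trainer variables on top, as start_procs() does.
    current_env.update({
        "PADDLE_TRAINER_ID": "0",
        "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:6170",
        "PADDLE_TRAINERS_NUM": "1",
        "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:6170",
    })
    proc = subprocess.Popen([sys.executable, "-u", "train.py"],
                            env=current_env)
    proc.wait()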