diff --git a/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
index 4da889b400071f9bb4bb6476d25e2ba5957ea2ee..715ca97f3715128c6d2ccfcbb8d291f84f176a6d 100644
--- a/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
+++ b/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
@@ -42,6 +42,9 @@ DEFINE_int32(
 namespace paddle {
 namespace framework {
 namespace ir {
+// The unit of FLAGS_fuse_parameter_memory_size (bytes per MB).
+static constexpr double kMB = 1048576.0;
+
 // SetFuseParameterGroupsSize and SetFuseParameterMemorySize are used in unit
 // test, because it is invalid that seting 'FLAGS_fuse_parameter_memory_size'
 // and 'FLAGS_fuse_parameter_groups_size' in unit test.
@@ -228,8 +231,8 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
       }
       VLOG(10) << out.str()
                << ", group size:" << group_grads_params->at(i).size()
-               << ", group memory size:"
-               << static_cast<double>(gps_size) / 1048576.0 << "(MB)";
+               << ", group memory size:" << static_cast<double>(gps_size) / kMB
+               << "(MB)";
     }
   }

@@ -270,7 +273,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
         break;
       }

-      if (static_cast<double>(local_group_memory_size) / 1048576.0 >=
+      if (static_cast<double>(local_group_memory_size) / kMB >=
           group_memory_size) {
         break;
       }
diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py
index b685723763833321bba1c70bec32cca52268d728..06369ea6b701ec2edac781f56dd76a20cff6e6e4 100644
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -164,6 +164,13 @@ def start_procs(args):
           ", node_ips:", node_ips, ", nranks:", nranks)

     current_env = copy.copy(default_env)
+    # paddle broadcasts ncclUniqueId over sockets, and a proxy may make
+    # the trainers unreachable, so drop the proxy settings. Setting them
+    # to "" instead of deleting them makes grpc log the error "bad uri",
+    # so the keys are popped rather than overwritten.
+    current_env.pop("http_proxy", None)
+    current_env.pop("https_proxy", None)
+
     procs = []
     cmds = []
     for i in range(0, selected_gpus_num):
@@ -173,11 +180,7 @@
             "PADDLE_CURRENT_ENDPOINT":
             "%s:%d" % (current_node_ip, args.started_port + i),
             "PADDLE_TRAINERS_NUM": "%d" % nranks,
-            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints,
-            # paddle broadcast ncclUniqueId use socket, and
-            # proxy maybe make trainers unreachable, so set them to ""
-            "http_proxy": "",
-            "https_proxy": ""
+            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
         })

     cmd = [sys.executable, "-u", args.training_script
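
Note on the proxy handling above: popping "http_proxy"/"https_proxy" from the copied environment removes the keys entirely, whereas the old code set them to "", which gRPC still tries to parse as a (malformed) proxy URI and rejects with "bad uri". Below is a minimal standalone sketch of that pattern, not Paddle code; the script name "train.py" and the PADDLE_* values are hypothetical placeholders.

    import os
    import subprocess
    import sys

    # Copy the parent environment so the caller's env stays untouched.
    child_env = os.environ.copy()

    # pop() with a default is a no-op when the key is absent; deleting the
    # keys (instead of setting them to "") lets clients fall back to a
    # direct connection rather than parsing an empty proxy URI.
    child_env.pop("http_proxy", None)
    child_env.pop("https_proxy", None)

    child_env.update({
        "PADDLE_TRAINER_ID": "0",                     # hypothetical values
        "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:6170",
        "PADDLE_TRAINERS_NUM": "1",
    })

    # Launch the trainer with the scrubbed environment.
    proc = subprocess.Popen([sys.executable, "-u", "train.py"], env=child_env)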