BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit da9143c1 (unverified)
Authored June 11, 2019 by gongweibao; committed via GitHub, June 11, 2019.

Polish codes of old prs. (#17938)

Parent: bce259e5

2 changed files, 14 additions and 8 deletions:
- paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc (+6 −3)
- python/paddle/distributed/launch.py (+8 −5)
paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc

```diff
@@ -42,6 +42,9 @@ DEFINE_int32(
 namespace paddle {
 namespace framework {
 namespace ir {
+// unit of the FLAGS_fuse_parameter_memory_size.
+static constexpr double kMB = 1048576.0;
+
 // SetFuseParameterGroupsSize and SetFuseParameterMemorySize are used in unit
 // test, because it is invalid that seting 'FLAGS_fuse_parameter_memory_size'
 // and 'FLAGS_fuse_parameter_groups_size' in unit test.
```
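The hunk above names the magic number 1048576.0 as a constant, `kMB`, so the same byte-to-megabyte conversion is spelled once. A minimal Python sketch of the same idea (the names `KMB` and `bytes_to_mb` are illustrative, not from the Paddle codebase):

```python
# Bytes per megabyte, mirroring the C++ `kMB` constant added in the hunk above.
KMB = 1048576.0


def bytes_to_mb(n_bytes):
    """Convert a byte count to megabytes, as the fuse pass does for logging."""
    return n_bytes / KMB


group_mb = bytes_to_mb(3 * 1048576)  # → 3.0
```

Centralizing the constant means the logging hunk and the group-size check below can no longer drift apart by one of them editing the literal.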
```diff
@@ -228,8 +231,8 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
     }
     VLOG(10) << out.str()
              << ", group size:" << group_grads_params->at(i).size()
-             << ", group memory size:"
-             << static_cast<double>(gps_size) / 1048576.0 << "(MB)";
+             << ", group memory size:" << static_cast<double>(gps_size) / kMB
+             << "(MB)";
   }
 }
```
```diff
@@ -270,7 +273,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
       break;
     }
-    if (static_cast<double>(local_group_memory_size) / 1048576.0 >=
+    if (static_cast<double>(local_group_memory_size) / kMB >=
         group_memory_size) {
       break;
     }
```
python/paddle/distributed/launch.py

```diff
@@ -164,6 +164,13 @@ def start_procs(args):
           ", node_ips:", node_ips, ", nranks:", nranks)

     current_env = copy.copy(default_env)
+    # paddle broadcast ncclUniqueId use socket, and
+    # proxy maybe make trainers unreachable, so delete them.
+    # if we set them to "", grpc will log error message "bad uri"
+    # so just delete them.
+    current_env.pop("http_proxy", None)
+    current_env.pop("https_proxy", None)
+
     procs = []
     cmds = []
     for i in range(0, selected_gpus_num):
```
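The hunk above deletes `http_proxy`/`https_proxy` from the child environment instead of overriding them with empty strings (which, per the commit's own comment, makes grpc log a "bad uri" error). A standalone sketch of the pattern, assuming only that the environment is a plain mapping (`strip_proxies` and the sample dict are illustrative, not Paddle API):

```python
import copy


def strip_proxies(default_env):
    # Work on a copy so the launcher's own environment stays untouched.
    env = copy.copy(default_env)
    # Delete the keys outright; setting them to "" would make grpc
    # log a "bad uri" error, as the commit's comment notes.
    env.pop("http_proxy", None)
    env.pop("https_proxy", None)
    return env


child_env = strip_proxies({"PATH": "/usr/bin", "http_proxy": "http://proxy:8080"})
```

Each spawned trainer then inherits `child_env`, so NCCL's socket-based broadcast of `ncclUniqueId` is not routed through an unreachable proxy.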
```diff
@@ -173,11 +180,7 @@ def start_procs(args):
             "PADDLE_CURRENT_ENDPOINT":
             "%s:%d" % (current_node_ip, args.started_port + i),
             "PADDLE_TRAINERS_NUM": "%d" % nranks,
-            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints,
-            # paddle broadcast ncclUniqueId use socket, and
-            # proxy maybe make trainers unreachable, so set them to ""
-            "http_proxy": "",
-            "https_proxy": ""
+            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
         })
         cmd = [sys.executable, "-u", args.training_script
```