Commit e7670b7b authored by Xi Chen

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-sendrec-ci

@@ -2,4 +2,15 @@
Cluster Training and Prediction
###############################
TBD
.. contents::
1. Network connection errors in the log during multi-node cluster training
--------------------------------------------------------------------------
During multi-node cluster training, the log may contain errors related to network connection problems, for example :code:`Connection reset by peer`.
This kind of error is usually caused by the abnormal exit of a training process on one node, after which the other nodes can no longer connect to it. Steps to troubleshoot the problem are as follows:
* Find the first error in :code:`train.log` and :code:`server.log`, and check whether the problem was caused by some other fault, such as a floating-point exception (FPE) or running out of memory or disk space.
* If the first error in :code:`server.log` says "Address already used", it may be caused by a port conflict with a non-exclusive job. Contact the system administrator to check whether the current MPI cluster supports jobs submitted with the parameter :code:`resource=full`. If the cluster does not support this parameter, change the server port and try again.
* If the current MPI cluster does not support an exclusive mode, in which a job occupies a whole node by itself, ask the administrator to replace or upgrade the cluster.
@@ -48,6 +48,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
void* dest, int size) {
const void* data = NULL;
int size_to_write = 0;
int length = size;
int total_written = 0;
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
@@ -56,16 +58,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
platform::CPUPlace cpu;
char* p = reinterpret_cast<char*>(dest);
while (size > 0) {
while (total_written < length) {
if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
return false;
}
// NOTE: if the raw buffer is large and two neighboring fields are both
// raw buffers, GetDirectBufferPointer can return all of them at once, so
// use length to truncate it.
if (total_written + size_to_write > length) {
size_to_write = length - total_written;
}
memory::Copy(boost::get<platform::CUDAPlace>(place),
reinterpret_cast<void*>(p), cpu, data, size_to_write,
gpu_dev_ctx.stream());
p += size_to_write;
size -= size_to_write;
total_written += size_to_write;
input->Skip(size_to_write);
}
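For context, the memory::Copy call in the GPU branch enqueues an asynchronous host-to-device copy on the device context's stream. A minimal standalone sketch of that one step, assuming (as this place combination suggests) that it reduces to cudaMemcpyAsync; CopyChunkToDevice is a hypothetical helper, not Paddle API:

#include <cuda_runtime.h>

// One chunk of the GPU branch above: copy `bytes` from the CPU-side
// direct buffer to the device pointer, asynchronously on `stream`.
cudaError_t CopyChunkToDevice(void* dst_gpu, const void* src_cpu,
                              size_t bytes, cudaStream_t stream) {
  return cudaMemcpyAsync(dst_gpu, src_cpu, bytes, cudaMemcpyHostToDevice,
                         stream);
}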
@@ -77,16 +84,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
}
char* p = reinterpret_cast<char*>(dest);
while (size > 0) {
while (total_written < length) {
if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
return false;
}
// NOTE: if the raw buffer is large and two neighboring fields are both raw
// buffers, GetDirectBufferPointer can return all of them at once, so use
// length to truncate it.
if (total_written + size_to_write > length) {
size_to_write = length - total_written;
}
// TODO(gongwb): can we avoid copy?
platform::CPUPlace cpu;
memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
p += size_to_write;
size -= size_to_write;
total_written += size_to_write;
input->Skip(size_to_write);
}
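Taken together, the loop implements a bounded read: GetDirectBufferPointer may expose more bytes than belong to the current field, so each chunk is capped at the bytes still owed and the stream is advanced by exactly the amount consumed. The same pattern can be exercised against a plain protobuf CodedInputStream; ReadExactly below is a hypothetical stand-in for ReadRaw, CPU path only:

#include <cstring>
#include <google/protobuf/io/coded_stream.h>

// Copy exactly `length` bytes from `input` into `dest`, truncating each
// direct buffer to the bytes still owed so a buffer that spans the next
// field is not over-consumed. Returns false if the stream ends early.
bool ReadExactly(::google::protobuf::io::CodedInputStream* input, void* dest,
                 int length) {
  char* p = reinterpret_cast<char*>(dest);
  int total_written = 0;
  while (total_written < length) {
    const void* data = nullptr;
    int size_to_write = 0;
    if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
      return false;
    }
    if (total_written + size_to_write > length) {
      size_to_write = length - total_written;  // cap at this field's bytes
    }
    std::memcpy(p, data, size_to_write);
    p += size_to_write;
    total_written += size_to_write;
    input->Skip(size_to_write);  // advance past what was copied
  }
  return true;
}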
@@ -153,6 +165,7 @@ bool VariableResponse::CopySelectRowsData(
const platform::DeviceContext& ctx, int length) {
auto var = scope_->FindVar(meta_.varname());
auto* slr = var->GetMutable<framework::SelectedRows>();
slr->mutable_rows()->resize(length / 8);  // rows are int64, 8 bytes each
int64_t* rows_data = slr->mutable_rows()->data();
// copy rows CPU data, GPU data will be copied lazily.
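The new resize line converts the byte length of the rows section into an element count; the divisor 8 is sizeof(int64_t), since SelectedRows row indices are int64. A one-line sanity check of that arithmetic (standalone sketch, hypothetical helper name):

#include <cstdint>

static_assert(sizeof(int64_t) == 8, "row indices are 8-byte int64");

// Bytes received for the rows section -> number of row indices.
inline int NumRowIndices(int byte_length) {
  return byte_length / static_cast<int>(sizeof(int64_t));
}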
@@ -233,7 +246,6 @@ int VariableResponse::Parse(Source* source) {
if (tag != 0) {
return -1;
}
return 0;
}
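The tail of Parse follows the usual protobuf convention: ReadTag() returns 0 on a clean end of input, so any leftover nonzero tag is treated as a malformed message. A minimal sketch of that final check (FinishParse is a hypothetical helper, not the function above):

#include <cstdint>
#include <google/protobuf/io/coded_stream.h>

// Mirror of the ending above: 0 means the stream ended cleanly; any
// remaining nonzero tag is an unexpected trailing field.
int FinishParse(::google::protobuf::io::CodedInputStream* input) {
  const uint32_t tag = input->ReadTag();
  if (tag != 0) {
    return -1;  // malformed or unhandled trailing field
  }
  return 0;
}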