Commit e7670b7b authored by Xi Chen

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-sendrec-ci

@@ -2,4 +2,15 @@
 Cluster Training and Prediction
 ###############################
-TBD
+.. contents::
+
+1. Network connection errors in the log during multi-node cluster training
+---------------------------------------------------------------------------
+
+During multi-node cluster training, the log may contain errors related to network connection problems, for example, :code:`Connection reset by peer`.
+This kind of error is usually caused by the abnormal exit of a training process on some node, after which the other nodes can no longer connect to it. Steps to troubleshoot the problem are as follows:
+
+* Find the first error in :code:`train.log` and :code:`server.log`, and check whether some other fault caused the problem, such as an FPE, or running out of memory or disk space.
+* If the first error in :code:`server.log` says "Address already in use", it may be caused by a port conflict due to non-exclusive execution. Contact the sys-admin to check whether the current MPI cluster supports jobs submitted with the parameter :code:`resource=full`. If it does not, change the server port and try again (see the sketch after this list).
+* If the current MPI cluster does not support an exclusive pattern, which allows a job to occupy a whole node, ask the administrator to replace or upgrade this cluster.
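As a concrete illustration of the "Address already in use" symptom described in the second bullet, here is a minimal, hypothetical C++ sketch (not part of this commit; the port number 7164 is an arbitrary choice) that reproduces the error by binding two sockets to the same port:

#include <cerrno>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

// Try to bind a TCP socket to `port` on localhost; print the errno string
// on failure. Returns the socket fd, or -1 if bind failed.
int BindTo(int port) {
  int fd = socket(AF_INET, SOCK_STREAM, 0);
  sockaddr_in addr{};
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
  addr.sin_port = htons(static_cast<uint16_t>(port));
  if (bind(fd, reinterpret_cast<sockaddr*>(&addr), sizeof(addr)) < 0) {
    std::cerr << "bind(" << port << "): " << std::strerror(errno) << "\n";
    close(fd);
    return -1;
  }
  return fd;
}

int main() {
  const int port = 7164;      // arbitrary example port, not a PaddlePaddle default
  int first = BindTo(port);   // first bind succeeds
  int second = BindTo(port);  // second bind fails: "Address already in use" (EADDRINUSE)
  if (first >= 0) close(first);
  if (second >= 0) close(second);
  return 0;
}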
@@ -48,6 +48,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
              void* dest, int size) {
   const void* data = NULL;
   int size_to_write = 0;
+  int length = size;
+  int total_written = 0;

   if (platform::is_gpu_place(place)) {
 #ifdef PADDLE_WITH_CUDA
@@ -56,16 +58,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
     platform::CPUPlace cpu;
     char* p = reinterpret_cast<char*>(dest);
-    while (size > 0) {
+    while (total_written < length) {
       if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
         return false;
       }
+      // NOTE: if the raw buffer is large and has two neighboring raw-buffer
+      // fields, GetDirectBufferPointer can return all of them; use `length`
+      // to truncate it.
+      if (total_written + size_to_write > length) {
+        size_to_write = length - total_written;
+      }
       memory::Copy(boost::get<platform::CUDAPlace>(place),
                    reinterpret_cast<void*>(p), cpu, data, size_to_write,
                    gpu_dev_ctx.stream());
       p += size_to_write;
-      size -= size_to_write;
+      total_written += size_to_write;
       input->Skip(size_to_write);
     }
@@ -77,16 +84,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
   }

   char* p = reinterpret_cast<char*>(dest);
-  while (size > 0) {
+  while (total_written < length) {
     if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
       return false;
     }
+    // NOTE: if the raw buffer is large and has two neighboring raw-buffer
+    // fields, GetDirectBufferPointer can return all of them; use `length`
+    // to truncate it.
+    if (total_written + size_to_write > length) {
+      size_to_write = length - total_written;
+    }
     // TODO(gongwb): can we avoid copy?
     platform::CPUPlace cpu;
     memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
     p += size_to_write;
-    size -= size_to_write;
+    total_written += size_to_write;
     input->Skip(size_to_write);
   }
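Both branches now implement the same pattern: read through GetDirectBufferPointer, clamp to the remaining byte budget, copy, then Skip. Here is a minimal, self-contained sketch of that pattern; it is not the commit's code — std::memcpy stands in for memory::Copy, and the name ReadExactly is invented for illustration:

#include <cstring>
#include <iostream>
#include <string>

#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>

// Hypothetical helper (not from the commit): drain exactly `length` bytes
// from a CodedInputStream into `dest`, clamping whenever
// GetDirectBufferPointer exposes a buffer that extends past the field being
// read (e.g. into a neighboring raw-buffer field).
bool ReadExactly(google::protobuf::io::CodedInputStream* input, void* dest,
                 int length) {
  char* p = static_cast<char*>(dest);
  int total_written = 0;
  while (total_written < length) {
    const void* data = nullptr;
    int size_to_write = 0;
    if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
      return false;  // stream exhausted before `length` bytes were read
    }
    // Clamp to the remaining byte budget, as the commit does.
    if (total_written + size_to_write > length) {
      size_to_write = length - total_written;
    }
    std::memcpy(p, data, size_to_write);
    p += size_to_write;
    total_written += size_to_write;
    input->Skip(size_to_write);  // advance past the bytes just consumed
  }
  return true;
}

int main() {
  std::string payload = "0123456789ABCDEF";  // two adjacent 8-byte "fields"
  google::protobuf::io::ArrayInputStream raw(payload.data(),
                                             static_cast<int>(payload.size()));
  google::protobuf::io::CodedInputStream coded(&raw);

  char first_field[8];
  if (ReadExactly(&coded, first_field, sizeof(first_field))) {
    // Only the first field's bytes were consumed: prints "01234567".
    std::cout << std::string(first_field, sizeof(first_field)) << "\n";
  }
  return 0;
}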
@@ -153,6 +165,7 @@ bool VariableResponse::CopySelectRowsData(
     const platform::DeviceContext& ctx, int length) {
   auto var = scope_->FindVar(meta_.varname());
   auto* slr = var->GetMutable<framework::SelectedRows>();
+  slr->mutable_rows()->resize(length / 8);  // int64
   int64_t* rows_data = slr->mutable_rows()->data();

   // copy rows CPU data, GPU data will be copied lazily.
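The one-line resize above matters because CopySelectRowsData receives `length` in bytes while the rows vector holds int64_t elements, so the vector must own length / 8 elements before anything writes through mutable_rows()->data(). A minimal sketch of the same idea with a plain std::vector (the byte payload and the little-endian host assumption are illustrative, not from the commit):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // Two little-endian int64_t row indices (3 and 7), as they might arrive
  // in a serialized SelectedRows rows buffer. Assumes a little-endian host.
  const unsigned char bytes[16] = {3, 0, 0, 0, 0, 0, 0, 0,
                                   7, 0, 0, 0, 0, 0, 0, 0};
  const int length = sizeof(bytes);  // length arrives in bytes

  std::vector<int64_t> rows;
  // Equivalent of slr->mutable_rows()->resize(length / 8): without the
  // resize, the memcpy through rows.data() would write past the end of an
  // empty vector (undefined behavior).
  rows.resize(length / sizeof(int64_t));
  std::memcpy(rows.data(), bytes, length);

  for (int64_t r : rows) std::cout << r << "\n";  // prints 3 then 7
  return 0;
}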
@@ -233,7 +246,6 @@ int VariableResponse::Parse(Source* source) {
   if (tag != 0) {
     return -1;
   }

   return 0;
 }