diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c749c97f13649fe8432091414b56f7d0ea8ace8b..3fe750f47efc149bb1af6086841bffd5dd8e85fd 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -587,6 +587,9 @@ function(grpc_library TARGET_NAME)
   get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
   get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
 
+  #FIXME(putcn): the following line is supposed to generate *.pb.h and *.pb.cc,
+  # but somehow it didn't. Lines 602 to 604 patch this. Leaving this here for
+  # now to enable dist CI.
   protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
   set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
   set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
@@ -597,6 +600,9 @@ function(grpc_library TARGET_NAME)
           COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
           ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
           --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+          ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+          "${ABS_PROTO}"
           DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
 
   # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
diff --git a/doc/v2/faq/cluster/index_en.rst b/doc/v2/faq/cluster/index_en.rst
index 855b7e8e53307b82a72c156be4ef509e27edf822..fa942a09625bef78b28456beeb735272b686e061 100644
--- a/doc/v2/faq/cluster/index_en.rst
+++ b/doc/v2/faq/cluster/index_en.rst
@@ -2,4 +2,15 @@
 Cluster Training and Prediction
 ###############################
 
-TBD
+.. contents::
+
+1. Network connection errors in the log during multi-node cluster training
+---------------------------------------------------------------------------
+Errors related to network connection problems, such as :code:`Connection reset by peer`, may appear in the log during multi-node cluster training.
+This kind of error is usually caused by the abnormal exit of a training process on one node, after which the other nodes can no longer connect to it. Steps to troubleshoot the problem are as follows:
+
+* Find the first error in :code:`train.log` and :code:`server.log`, and check whether some other fault caused the problem, such as an FPE, or running out of memory or disk space.
+
+* If the first error in :code:`server.log` says "Address already used", it may be caused by a port conflict due to non-exclusive execution. Contact the sys-admin to check whether the current MPI cluster supports jobs submitted with the parameter :code:`resource=full`. If the current MPI cluster does not support this parameter, change the server port and try again.
+
+* If the current MPI cluster does not support an exclusive mode, which allows a process to occupy the whole node, ask the administrator to replace or upgrade the cluster.
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index 3693bc25d81a8309df1a6ddf3d9b08d484596ea9..fbe08349c37c4fde115ceea954ba2b84880088d7 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -147,15 +147,52 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
   if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
     return;
   }
+  auto get_vars = [](std::deque<std::unique_ptr<OpDesc>>::iterator &op,
+                     std::vector<std::string> &v) {
+    auto in_names = (*op)->InputArgumentNames();
+    v.insert(v.end(), in_names.begin(), in_names.end());
+    auto out_names = (*op)->OutputArgumentNames();
+    v.insert(v.end(), out_names.begin(), out_names.end());
+    std::sort(v.begin(), v.end());
+    auto last = std::unique(v.begin(), v.end());
+    v.erase(last, v.end());
+  };
   need_update_ = true;
-  for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) {
-    auto names = (*it)->InputArgumentNames();
-    for (auto n : names) {
-      // TODO(typhoonzero): delete vars if no other op use it.
-      VLOG(3) << "deleting var " << n;
+
+  for (size_t i = s; i < e; i++) {
+    // since ops are removed one by one, each time we remove the first op in the range.
+    auto op = ops_.begin() + s;
+
+    // collect the input and output variables of the op being deleted
+    std::vector<std::string> cur_vars;
+    get_vars(op, cur_vars);
+
+    // remove the current op
+    ops_.erase(ops_.begin() + s);
+
+    // collect the input and output variables of the remaining ops
+    std::vector<std::string> other_vars;
+    for (auto it = ops_.begin(); it != ops_.end(); it++) {
+      get_vars(it, other_vars);
+    }
+
+    // variables that should be deleted
+    std::vector<std::string> delete_vars;
+    // delete_vars = cur_vars - cur_vars ^ other_vars
+    std::set_difference(cur_vars.begin(), cur_vars.end(), other_vars.begin(),
+                        other_vars.end(),
+                        std::inserter(delete_vars, delete_vars.end()));
+    // remove variables
+    for (size_t i = 0; i < delete_vars.size(); i++) {
+      auto name = delete_vars[i];
+      auto it = vars_.find(name);
+      PADDLE_ENFORCE(it != vars_.end(),
+                     "%s is not in variable list, it should not be deleted",
+                     name);
+      vars_.erase(it);
+      VLOG(3) << "deleting variable " << name;
     }
   }
-  ops_.erase(ops_.begin() + s, ops_.begin() + e);
 }
 
 std::vector<OpDesc *> BlockDesc::AllOps() const {
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 185f018ac1b5863e0ee86fdaa17df1ccbc6e030e..468423e0e8e7b8c9ebc14b7568c9c3bd21645ea7 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -89,6 +89,11 @@ class BlockDesc {
 
   OpDesc *InsertOp(size_t index);
 
+  /*
+   * Remove Op and its input/output variables.
+   * Note that for either an input or an output variable, if it is also an
+   * input or output variable of other ops, we should keep it.
+   */
   void RemoveOp(size_t s, size_t e);
 
   std::vector<OpDesc *> AllOps() const;
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 9a11e1be7050adb1803b1fd835ffb811d9cae4cd..8341170d6897d71ddf95d4de95f521f5d31ab7cd 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -264,3 +264,4 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memor
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
 nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc
index 12e8eb0b4da2252b104415aef4156bf100c3e565..bdda5703436765480f353ee964624364f45dbefb 100644
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -48,6 +48,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
              void* dest, int size) {
   const void* data = NULL;
   int size_to_write = 0;
+  int length = size;
+  int total_written = 0;
 
   if (platform::is_gpu_place(place)) {
 #ifdef PADDLE_WITH_CUDA
@@ -56,16 +58,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
     platform::CPUPlace cpu;
 
     char* p = reinterpret_cast<char*>(dest);
-    while (size > 0) {
+    while (total_written < length) {
       if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
         return false;
       }
-
+      // NOTE: if the raw buffer is large and has two neighboring fields of raw
+      // buffers, GetDirectBufferPointer can get all of them, so use length to
+      // truncate it.
+      if (total_written + size_to_write > length) {
+        size_to_write = length - total_written;
+      }
       memory::Copy(boost::get<platform::CUDAPlace>(place),
                    reinterpret_cast<void*>(p), cpu, data, size_to_write,
                    gpu_dev_ctx.stream());
       p += size_to_write;
-      size -= size_to_write;
+      total_written += size_to_write;
 
       input->Skip(size_to_write);
     }
@@ -77,16 +84,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
   }
 
   char* p = reinterpret_cast<char*>(dest);
-  while (size > 0) {
+  while (total_written < length) {
     if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
       return false;
    }
+    // NOTE: if the raw buffer is large and has two neighboring fields of raw buffers,
+    // GetDirectBufferPointer can get all of them, so use length to truncate it.
+    if (total_written + size_to_write > length) {
+      size_to_write = length - total_written;
+    }
     // TODO(gongwb): can we avoid copy?
     platform::CPUPlace cpu;
     memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
 
     p += size_to_write;
-    size -= size_to_write;
+    total_written += size_to_write;
 
     input->Skip(size_to_write);
   }
@@ -153,6 +165,7 @@ bool VariableResponse::CopySelectRowsData(
     const platform::DeviceContext& ctx, int length) {
   auto var = scope_->FindVar(meta_.varname());
   auto* slr = var->GetMutable<framework::SelectedRows>();
+  slr->mutable_rows()->resize(length / 8);  // int64
   int64_t* rows_data = slr->mutable_rows()->data();
 
   // copy rows CPU data, GPU data will be copied lazily.
@@ -233,7 +246,6 @@ int VariableResponse::Parse(Source* source) {
       if (tag != 0) {
         return -1;
       }
-
       return 0;
     }
 
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index 94382739b5077b1449a8fd5be7952f35737ca340..184c095e487a302ebc4d251dd6f332333c415c6d 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -55,9 +55,6 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     y->mutable_data<T>(context.GetPlace());
     float dropout_prob = context.Attr<float>("dropout_prob");
 
-    auto X = EigenMatrix<T>::Reshape(*x, 1);
-    auto Y = EigenMatrix<T>::Reshape(*y, 1);
-
     auto& place = *context.template device_context<Place>().eigen_device();
     if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
@@ -76,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
           T><<<grid, threads, 0, stream>>>(
           size, seed, dropout_prob, x_data, mask_data, y_data);
     } else {
+      auto X = EigenMatrix<T>::Reshape(*x, 1);
+      auto Y = EigenMatrix<T>::Reshape(*y, 1);
       Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
     }
   }
diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..db97ba4f64105c37c49cafbc3fbc4829c5077467
--- /dev/null
+++ b/paddle/fluid/operators/dropout_op_test.cc
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(dropout);
+
+void Compare(f::Scope& scope, p::DeviceContext& ctx) {
+  // init
+  auto var = scope.Var("X");
+  auto tensor = var->GetMutable<f::LoDTensor>();
+  tensor->Resize({10, 10});
+
+  std::vector<float> init;
+  for (int64_t i = 0; i < 10 * 10; ++i) {
+    init.push_back(1.0);
+  }
+
+  TensorFromVector(init, ctx, tensor);
+
+  auto place = ctx.GetPlace();
+  auto out_var = scope.Var("Out");
+  auto out_tensor = out_var->GetMutable<f::LoDTensor>();
+  out_tensor->Resize({10, 10});
+  out_tensor->mutable_data<float>(place);  // allocate
+
+  auto mask_var = scope.Var("Mask");
+  auto mask_tensor = mask_var->GetMutable<f::LoDTensor>();
+  mask_tensor->Resize({10, 10});
+  mask_tensor->mutable_data<float>(place);  // allocate
+
+  // run
+  f::AttributeMap attrs;
+  float dropout_prob = 0.5;
+  attrs.insert({"fix_seed", 1});
+  attrs.insert({"seed", 3});
+  attrs.insert({"dropout_prob", dropout_prob});
+  auto dropout_op = f::OpRegistry::CreateOp(
+      "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs);
+
+  dropout_op->Run(scope, place);
+
+  std::vector<float> out_vec;
+  TensorToVector(*out_tensor, ctx, &out_vec);
+
+  std::vector<float> std_out = {
+      0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
+      1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
+      1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
+      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
+      1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1};
+
+  EXPECT_EQ(out_vec.size(), std_out.size());
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], std_out[i]);
+  }
+}
+
+TEST(Dropout, CPUDense) {
+  f::Scope scope;
+  p::CPUPlace place;
+  p::CPUDeviceContext ctx(place);
+  Compare(scope, ctx);
+}
+
+TEST(Dropout, GPUDense) {
+  f::Scope scope;
+  p::CUDAPlace place;
+  p::CUDADeviceContext ctx(place);
+  Compare(scope, ctx);
+}
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
index 4001b9a130348b4e3ea99f3017eae6d85e41fc6e..b28c16b13fce30c6e9be9953009b53e722cf4885 100644
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -144,7 +144,12 @@ class ParallelDoOp : public framework::OperatorBase {
       PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(),
                      "Only support parameter type as LoDTensor");
       auto &src = scope.FindVar(param)->Get<LoDTensor>();
-      for (size_t i = 0; i < sub_scopes.size(); ++i) {
+
+      auto *sub_scope0 = sub_scopes[0];
+      auto *dst0 = sub_scope0->Var(param)->GetMutable<LoDTensor>();
+      dst0->ShareDataWith(src);
+
+      for (size_t i = 1; i < sub_scopes.size(); ++i) {
         auto &place = places[i];
         auto *sub_scope = sub_scopes[i];
         auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 80fa0c72af65cbdc21ba955389318a233e02657c..1283de9d957a46b848c7bb6caf9c5f49398468e2 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -153,9 +153,15 @@ if [ $? -ne 0 ]; then
     exit 1
 fi
 
-INSTALLED_VERSION=`pip freeze 2>/dev/null | grep '^paddle' | sed 's/.*==//g'`
+if [ "@WITH_GPU@" == "ON" ]; then
+    PADDLE_NAME="paddlepaddle-gpu"
+else
+    PADDLE_NAME="paddlepaddle"
+fi
+
+INSTALLED_VERSION=`pip freeze 2>/dev/null | grep "^${PADDLE_NAME}==" | sed 's/.*==//g'`
 
-if [ -z ${INSTALLED_VERSION} ]; then
+if [ -z "${INSTALLED_VERSION}" ]; then
     INSTALLED_VERSION="0.0.0" # not installed
 fi
 cat < 1e-5: tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
index 309ea2b9b7ede442da3ac897ce8d1a4b9aa68233..da85786d0c085a4e97d9ac272feed251296ad52d 100644
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
@@ -186,6 +186,34 @@ class TestBlockDesc(unittest.TestCase):
             all_ops.append(block.op(idx))
         self.assertEqual(all_ops, [op0, op1, op2])
 
+    def test_remove_op(self):
+        prog = core.ProgramDesc()
+        self.assertIsNotNone(prog)
+        block = prog.block(0)
+        self.assertIsNotNone(block)
+        op1 = block.append_op()
+        op2 = block.append_op()
+        var1 = block.var("var1")
+        var2 = block.var("var2")
+        var3 = block.var("var3")
+        var4 = block.var("var4")
+        var5 = block.var("var5")
+        op1.set_input("X", ["var1", "var2"])
+        op1.set_output("Y", ["var3", "var4"])
+        op2.set_input("X", ["var1"])
+        op2.set_output("Y", ["var4", "var5"])
+
+        # remove op1; its input var2 and output var3 will be removed at the same time,
+        # but its input var1 and output var4 will not be removed since they are also used by op2.
+        block.remove_op(0, 1)
+
+        all_ops = []
+        for idx in xrange(0, block.op_size()):
+            all_ops.append(block.op(idx))
+        self.assertEqual(all_ops, [op2])
+        all_vars = block.all_vars()
+        self.assertEqual(set(all_vars), {var1, var4, var5})
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py
index 00efc01c0592107314f5b23c951706d039d49a88..3683968262266a2d654d2480b828173bc761152b 100644
--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation):
 
     .. math::
 
-        P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_j} }
+        P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_k} }
     """
 
     def __init__(self):