From da556ed6d441f5438cab75c8e0dd82ed62344633 Mon Sep 17 00:00:00 2001 From: chengduo Date: Fri, 22 Jun 2018 09:39:39 +0800 Subject: [PATCH] enhance ParallelExecutor stable (#11637) --- .../framework/details/broadcast_op_handle.cc | 57 +++++-------------- .../fluid/framework/details/op_handle_base.cc | 36 +++--------- .../fluid/framework/details/op_handle_base.h | 4 -- .../framework/details/reduce_op_handle.cc | 4 +- paddle/fluid/platform/device_context.h | 8 --- 5 files changed, 25 insertions(+), 84 deletions(-) diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index b0bf641d9d..1d9f1bd6e4 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -103,50 +103,23 @@ void BroadcastOpHandle::RunImpl() { }); } - // FIXME(zcd): a temporary fix for some language model that has sparse - // parameter. - bool use_mutex = true; - if (in_var->IsType()) { - use_mutex = false; - } - if (use_mutex) { - this->RunAndRecordEvent([&] { - { - platform::NCCLGroupGuard guard; - for (auto &call : broadcast_calls) { - call(); - } - } - - if (!out_handle->IsTheSameVar(*in_var_handle)) { - auto out_var = var_scopes.at(in_var_handle->scope_idx_) - ->FindVar(out_var_handles[0]->name_); - paddle::framework::TensorCopy( - in_tensor, in_var_handle->place_, - *(dev_ctxes_.at(in_var_handle->place_)), - &VariableVisitor::GetMutableTensor(out_var)); - } - }); - } else { - this->RunAndRecordEventNoMutex([&] { - { - platform::NCCLGroupGuard guard; - for (auto &call : broadcast_calls) { - call(); - } - } - - if (!out_handle->IsTheSameVar(*in_var_handle)) { - auto out_var = var_scopes.at(in_var_handle->scope_idx_) - ->FindVar(out_var_handles[0]->name_); - paddle::framework::TensorCopy( - in_tensor, in_var_handle->place_, - *(dev_ctxes_.at(in_var_handle->place_)), - &VariableVisitor::GetMutableTensor(out_var)); + this->RunAndRecordEvent([&] { + { + 
platform::NCCLGroupGuard guard; + for (auto &call : broadcast_calls) { + call(); } - }); - } + } + if (!out_handle->IsTheSameVar(*in_var_handle)) { + auto out_var = var_scopes.at(in_var_handle->scope_idx_) + ->FindVar(out_var_handles[0]->name_); + paddle::framework::TensorCopy( + in_tensor, in_var_handle->place_, + *(dev_ctxes_.at(in_var_handle->place_)), + &VariableVisitor::GetMutableTensor(out_var)); + } + }); #else PADDLE_THROW("CUDA is not enabled."); #endif diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index a40a881508..1f84c3b9e2 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/framework/details/op_handle_base.h" +#include <map> namespace paddle { namespace framework { @@ -122,35 +122,17 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) { #ifdef PADDLE_WITH_CUDA if (!events_.empty()) { // Use event std::function<void()> method = callback; - + // NOTE(zcd): device context must be ordered here because RecordEvent + // will use a mutex to ensure the safe of multi-threads. 
+ std::map<platform::DeviceContext *, platform::Place> ordered_ctxes; for (auto &p : dev_ctxes_) { - method = [method, p, this]() { - static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent( - events_.at(boost::get<platform::CUDAPlace>(p.first).device), - method); - }; + ordered_ctxes.emplace(p.second, p.first); } - method(); - } else { -#endif - callback(); -#ifdef PADDLE_WITH_CUDA - } -#endif -} - -void OpHandleBase::RunAndRecordEventNoMutex( - const std::function<void()> &callback) { -#ifdef PADDLE_WITH_CUDA - if (!events_.empty()) { // Use event - std::function<void()> method = callback; - - for (auto &p : dev_ctxes_) { + for (auto &p : ordered_ctxes) { method = [method, p, this]() { - static_cast<platform::CUDADeviceContext *>(p.second) - ->RecordEventNoMutex( - events_.at(boost::get<platform::CUDAPlace>(p.first).device), - method); + static_cast<platform::CUDADeviceContext *>(p.first)->RecordEvent( + events_.at(boost::get<platform::CUDAPlace>(p.second).device), + method); }; } method(); diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 775be0233a..fbd90a3296 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -85,10 +85,6 @@ class OpHandleBase { protected: void RunAndRecordEvent(const std::function<void()> &callback); - // FIXME(zcd): A temporary fix for some language model that has sparse - // parameter. - void RunAndRecordEventNoMutex(const std::function<void()> &callback); - void RunAndRecordEvent(platform::Place p, const std::function<void()> &callback); diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 9a626c890f..7160e346da 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -80,9 +80,7 @@ void ReduceOpHandle::RunImpl() { } if (pre_in_var->IsType()) { - // FIXME(zcd): A temporary fix for some language model that has sparse - // parameter. 
- this->RunAndRecordEventNoMutex([&] { + this->RunAndRecordEvent([&] { std::vector in_selected_rows = GetInputValues(in_var_handles, var_scopes); GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p, diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index d37e5ee578..292ffef1ae 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -106,14 +106,6 @@ class CUDADeviceContext : public DeviceContext { PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } - // FIXME(zcd): A temporary fix for some language model that has sparse - // parameter. - template <typename Callback> - void RecordEventNoMutex(cudaEvent_t ev, Callback callback) { - callback(); - PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); - } - private: CUDAPlace place_; -- GitLab