diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 4779647435411ff838dbad6481d3527887634ddd..d6811aa6e0c3bda832a935a1a6c7bb04308f1c95 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,8 +2,6 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-        dynload_cuda)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
 
@@ -11,12 +9,16 @@ cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 
 if(WITH_GPU)
+  nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+          dynload_cuda)
   set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+  nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim dynload_cuda)
 else()
   set(multi_devices_graph_builder_deps)
+  cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
 endif()
 
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-           scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
+        scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
 
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
@@ -24,11 +26,10 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS
 
 cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
-cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
 
 cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context gather_op_handle)
 cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
-        device_context reduce_op_handle)
+        device_context reduce_op_handle )
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index ecaa83eb7ebfc227d1e563deca8fbea8caee4cc5..c805d15fbbf99381ce84731c12ca2be8b85ecd81 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -13,30 +13,16 @@
 // limitations under the License.
 
#include "paddle/fluid/framework/details/reduce_op_handle.h" -#include "paddle/fluid/framework/details/gather_op_handle.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" -#include "paddle/fluid/platform/nccl_helper.h" namespace paddle { namespace framework { namespace details { -std::vector GetValidVarHandle( - const std::vector &inputs) { - std::vector in_var_handles; - for (auto *in : inputs) { - auto *in_handle = dynamic_cast(in); - if (in_handle) { - in_var_handles.push_back(in_handle); - } - } - return in_var_handles; -} - void ReduceOpHandle::RunImpl() { // the input and output may have dummy var. - std::vector in_var_handles = GetValidVarHandle(inputs_); - std::vector out_var_handles = GetValidVarHandle(outputs_); + std::vector in_var_handles = GetValidVarHandles(inputs_); + std::vector out_var_handles = GetValidVarHandles(outputs_); PADDLE_ENFORCE_EQ( in_var_handles.size(), places_.size(), @@ -45,15 +31,10 @@ void ReduceOpHandle::RunImpl() { "The number of output should be one."); // Wait input done, this Wait is asynchronous operation - if (in_var_handles[0]->generated_op_) { - for (auto *in : in_var_handles) { - auto &in_p = in->place_; - in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in_p]); - } - } + WaitEvents(in_var_handles); // check in the same place - auto in_0_handle = static_cast(in_var_handles[0]); + auto in_0_handle = in_var_handles[0]; auto pre_place = in_0_handle->place_; std::vector in_places; @@ -120,6 +101,7 @@ void ReduceOpHandle::RunImpl() { for (size_t i = 0; i < local_scopes_.size(); ++i) { auto &p = in_places[i]; auto &lod_tensor = lod_tensors[i]; + int dev_id = boost::get(p).device; auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); @@ -139,18 +121,41 @@ void ReduceOpHandle::RunImpl() { }); } - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); - } + this->RunAndRecordEvent([&] { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + }); #else PADDLE_THROW("CUDA is not support."); #endif } else { - PADDLE_THROW("Error"); + PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); } } } + +void ReduceOpHandle::WaitEvents( + const std::vector &in_var_handles) { + if (in_var_handles[0]->generated_op_) { + for (auto *in : in_var_handles) { + in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in->place_]); + } + } +} + +std::vector ReduceOpHandle::GetValidVarHandles( + const std::vector &inputs) { + std::vector in_var_handles; + for (auto *in : inputs) { + auto *in_handle = dynamic_cast(in); + if (in_handle) { + in_var_handles.push_back(in_handle); + } + } + return in_var_handles; +} std::string ReduceOpHandle::Name() const { return "reduce"; } } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 0e91ad20695d7599737f02b9856535326a171808..7b36ce4a7bceaeb93ceb03730b2d54d0f36fed3d 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -23,7 +23,9 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" +#endif namespace paddle { namespace framework { @@ -57,6 +59,10 @@ struct ReduceOpHandle : public OpHandleBase { protected: void RunImpl() override; + std::vector GetValidVarHandles( + const std::vector &inputs); + + void 
 };
 
 }  // namespace details
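
Note: the refactor above hoists two helpers into ReduceOpHandle: GetValidVarHandles, which drops the DummyVarHandle placeholders that exist only to encode execution order, and WaitEvents, which makes the reduce wait on its producers' device contexts. Below is a minimal, self-contained sketch of the dummy-var filtering idiom; VarHandleBase, VarHandle, and DummyVarHandle here are simplified stand-ins, not the real Paddle classes.

```cpp
#include <iostream>
#include <string>
#include <vector>

struct VarHandleBase {
  virtual ~VarHandleBase() = default;  // polymorphic base, so dynamic_cast works
};

// A real variable produced or consumed by an op.
struct VarHandle : VarHandleBase {
  explicit VarHandle(std::string name) : name_(std::move(name)) {}
  std::string name_;
};

// A placeholder edge that only encodes ordering between ops.
struct DummyVarHandle : VarHandleBase {};

// The filtering idiom: dynamic_cast yields nullptr for DummyVarHandle,
// so only genuine VarHandles survive.
std::vector<VarHandle *> GetValidVarHandles(
    const std::vector<VarHandleBase *> &inputs) {
  std::vector<VarHandle *> handles;
  for (auto *in : inputs) {
    if (auto *h = dynamic_cast<VarHandle *>(in)) {
      handles.push_back(h);
    }
  }
  return handles;
}

int main() {
  VarHandle a("grad@gpu0"), b("grad@gpu1");
  DummyVarHandle dep;  // dependency-only edge, skipped by the filter
  std::vector<VarHandleBase *> inputs{&a, &dep, &b};
  for (auto *h : GetValidVarHandles(inputs)) {
    std::cout << h->name_ << "\n";  // prints grad@gpu0, grad@gpu1
  }
  return 0;
}
```

Filtering via dynamic_cast keeps dependency-only edges out of the reduction arithmetic without adding a type tag to the handles; on the GPU path, the diff likewise wraps the NCCL group calls in RunAndRecordEvent so the reduce records a completion event for downstream ops instead of leaving them to block on raw streams.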