Commit e4de957f authored by chengduoZH

code refine

Parent: 3301d440
paddle/fluid/framework/details/CMakeLists.txt
@@ -2,8 +2,6 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-    dynload_cuda)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
@@ -11,12 +9,16 @@ cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 if(WITH_GPU)
+  nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+      dynload_cuda)
   set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+  nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim dynload_cuda)
 else()
   set(multi_devices_graph_builder_deps)
+  cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
 endif()
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
     scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
@@ -24,11 +26,10 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS
 cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
-cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
 cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
     device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
     device_context gather_op_handle)
 cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
-    device_context reduce_op_handle)
+    device_context reduce_op_handle )
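
Note on the CMakeLists.txt change: nccl_all_reduce_op_handle now lives inside the if(WITH_GPU) branch, and reduce_op_handle is built as an nv_library with a dynload_cuda dependency on GPU builds but as a plain cc_library otherwise. The sources pair with this split through the PADDLE_WITH_CUDA macro; below is a minimal sketch of the pattern (the function name and comments are illustrative, not part of the patch):

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"  // present only in GPU builds
#endif

// Hypothetical helper showing how one source file serves both build modes.
void RunReduceSketch() {
#ifdef PADDLE_WITH_CUDA
  // GPU path: dispatch per-device ncclReduce calls.
#else
  // CPU path: sum the input tensors on the host.
#endif
}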
paddle/fluid/framework/details/reduce_op_handle.cc
@@ -13,30 +13,16 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
-#include "paddle/fluid/framework/details/gather_op_handle.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
-#include "paddle/fluid/platform/nccl_helper.h"
 namespace paddle {
 namespace framework {
 namespace details {
-std::vector<VarHandle *> GetValidVarHandle(
-    const std::vector<VarHandleBase *> &inputs) {
-  std::vector<VarHandle *> in_var_handles;
-  for (auto *in : inputs) {
-    auto *in_handle = dynamic_cast<VarHandle *>(in);
-    if (in_handle) {
-      in_var_handles.push_back(in_handle);
-    }
-  }
-  return in_var_handles;
-}
 void ReduceOpHandle::RunImpl() {
   // the input and output may have dummy var.
-  std::vector<VarHandle *> in_var_handles = GetValidVarHandle(inputs_);
-  std::vector<VarHandle *> out_var_handles = GetValidVarHandle(outputs_);
+  std::vector<VarHandle *> in_var_handles = GetValidVarHandles(inputs_);
+  std::vector<VarHandle *> out_var_handles = GetValidVarHandles(outputs_);
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), places_.size(),
@@ -45,15 +31,10 @@ void ReduceOpHandle::RunImpl() {
       "The number of output should be one.");
   // Wait input done, this Wait is asynchronous operation
-  if (in_var_handles[0]->generated_op_) {
-    for (auto *in : in_var_handles) {
-      auto &in_p = in->place_;
-      in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in_p]);
-    }
-  }
+  WaitEvents(in_var_handles);
   // check in the same place
-  auto in_0_handle = static_cast<VarHandle *>(in_var_handles[0]);
+  auto in_0_handle = in_var_handles[0];
   auto pre_place = in_0_handle->place_;
   std::vector<platform::Place> in_places;
@@ -120,6 +101,7 @@ void ReduceOpHandle::RunImpl() {
     for (size_t i = 0; i < local_scopes_.size(); ++i) {
       auto &p = in_places[i];
       auto &lod_tensor = lod_tensors[i];
+
       int dev_id = boost::get<platform::CUDAPlace>(p).device;
       auto &nccl_ctx = nccl_ctxs_->at(dev_id);
       auto stream = nccl_ctx.stream();
@@ -139,18 +121,41 @@
       });
     }
-    platform::NCCLGroupGuard guard;
-    for (auto &call : all_reduce_calls) {
-      call();
-    }
+    this->RunAndRecordEvent([&] {
+      platform::NCCLGroupGuard guard;
+      for (auto &call : all_reduce_calls) {
+        call();
+      }
+    });
 #else
     PADDLE_THROW("CUDA is not support.");
 #endif
   } else {
-    PADDLE_THROW("Error");
+    PADDLE_THROW("Place should be CPUPlace or CUDAPlace.");
   }
 }
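
Reviewer note on the hunk above: the batched NCCL calls are now issued inside this->RunAndRecordEvent(...), so the op handle can record its completion event as soon as the group is submitted, while platform::NCCLGroupGuard still launches all per-device calls as a single NCCL group. A minimal RAII sketch of that grouping idea, assuming the standard ncclGroupStart()/ncclGroupEnd() API (this is not Paddle's exact class):

#include <nccl.h>

// Every NCCL call issued while a guard is alive is batched into one group
// and submitted when the guard is destroyed, so the per-device reduce calls
// launch together instead of serializing or deadlocking against each other.
class NcclGroupGuardSketch {
 public:
  NcclGroupGuardSketch() { ncclGroupStart(); }
  ~NcclGroupGuardSketch() { ncclGroupEnd(); }
};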
+
+void ReduceOpHandle::WaitEvents(
+    const std::vector<VarHandle *> &in_var_handles) {
+  if (in_var_handles[0]->generated_op_) {
+    for (auto *in : in_var_handles) {
+      in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in->place_]);
+    }
+  }
+}
+
+std::vector<VarHandle *> ReduceOpHandle::GetValidVarHandles(
+    const std::vector<VarHandleBase *> &inputs) {
+  std::vector<VarHandle *> in_var_handles;
+  for (auto *in : inputs) {
+    auto *in_handle = dynamic_cast<VarHandle *>(in);
+    if (in_handle) {
+      in_var_handles.push_back(in_handle);
+    }
+  }
+  return in_var_handles;
+}
 std::string ReduceOpHandle::Name() const { return "reduce"; }
 }  // namespace details
 }  // namespace framework
...
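
The helper that moved above (and was renamed GetValidVarHandles) filters dummy dependency handles out of inputs_/outputs_ with a dynamic_cast: only real VarHandles carry a place and tensor, while dummy handles exist just to order ops in the SSA graph. A self-contained sketch of the idiom with simplified stand-in types:

#include <vector>

struct VarHandleBase { virtual ~VarHandleBase() = default; };
struct VarHandle : VarHandleBase { /* place_, name_, version_, ... */ };
struct DummyVarHandle : VarHandleBase {};  // ordering-only edge in the graph

// Keep only real variable handles; dummy handles merely order ops.
std::vector<VarHandle *> FilterVarHandles(
    const std::vector<VarHandleBase *> &vars) {
  std::vector<VarHandle *> handles;
  for (auto *v : vars) {
    if (auto *h = dynamic_cast<VarHandle *>(v)) handles.push_back(h);
  }
  return handles;
}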
paddle/fluid/framework/details/reduce_op_handle.h
@@ -23,7 +23,9 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
+#endif
 namespace paddle {
 namespace framework {
@@ -57,6 +59,10 @@ struct ReduceOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
+
+  std::vector<VarHandle *> GetValidVarHandles(
+      const std::vector<VarHandleBase *> &inputs);
+
+  void WaitEvents(const std::vector<VarHandle *> &in_var_handles);
 };
 }  // namespace details
...
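
Two notes on the header change: guarding the nccl_helper.h include with PADDLE_WITH_CUDA is what lets the new CPU-only cc_library(reduce_op_handle ...) target compile, and the new WaitEvents declaration names the event-style synchronization RunImpl relies on — the producing op has already recorded its work on its own stream, and Wait(dev_ctx) makes the consuming stream wait without blocking the host. A hedged CUDA sketch of that pattern (the function and stream names are illustrative, not Paddle's API):

#include <cuda_runtime.h>

// Cross-stream sync via an event: record on the producer's stream, then
// make the consumer's stream wait on it; the host thread never blocks.
void WaitOnProducer(cudaStream_t producer_stream, cudaStream_t consumer_stream) {
  cudaEvent_t done;
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
  cudaEventRecord(done, producer_stream);
  cudaStreamWaitEvent(consumer_stream, done, 0);
  cudaEventDestroy(done);  // safe: the stream dependency is already enqueued
}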