From cedade949412a1fcffa12714375e03e4234282af Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Fri, 13 Apr 2018 16:30:08 +0800
Subject: [PATCH] Stash

---
 .../details/nccl_all_reduce_op_handle.cc      | 84 ++++++++++++++-----
 1 file changed, 64 insertions(+), 20 deletions(-)

diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index 55b5f11358..6e4314e2a8 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 
+#include <algorithm>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -27,6 +29,32 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
   }
 }
 
+struct ReduceLoDTensor {
+  const std::vector<LoDTensor> &src_tensors_;
+  LoDTensor &dst_tensor_;
+
+  ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
+      : src_tensors_(src), dst_tensor_(*dst) {}
+
+  template <typename T>
+  void operator()() const {
+    PADDLE_ENFORCE(!src_tensors_.empty());
+    auto &t0 = src_tensors_[0];
+    PADDLE_ENFORCE_NE(t0.numel(), 0);
+    dst_tensor_.Resize(t0.dims());
+    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
+    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
+
+    for (size_t i = 1; i < src_tensors_.size(); ++i) {
+      auto &t = src_tensors_[i];
+      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
+      PADDLE_ENFORCE_EQ(t.type(), t0.type());
+      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
+                     [](T a, T b) -> T { return a + b; });
+    }
+  }
+};
+
 void NCCLAllReduceOpHandle::RunImpl() {
   if (inputs_.size() == 1) {
     return;  // No need to all reduce when GPU count = 1;
@@ -41,37 +69,53 @@ void NCCLAllReduceOpHandle::RunImpl() {
     int dtype = -1;
     size_t numel = 0;
 
-    std::vector<std::function<void()>> all_reduce_calls;
+    std::vector<LoDTensor> lod_tensors;
 
     for (size_t i = 0; i < local_scopes_.size(); ++i) {
-      auto &p = places_[i];
       auto *s = local_scopes_[i];
-      int dev_id = boost::get<platform::CUDAPlace>(p).device;
 
       auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
-      void *buffer = const_cast<void *>(lod_tensor.data<void>());
+      lod_tensors.emplace_back(lod_tensor);
+    }
+
+    if (platform::is_gpu_place(lod_tensors[0].place())) {
+      std::vector<std::function<void()>> all_reduce_calls;
+      for (size_t i = 0; i < local_scopes_.size(); ++i) {
+        auto &p = places_[i];
+        auto &lod_tensor = lod_tensors[i];
+        void *buffer = const_cast<void *>(lod_tensor.data<void>());
 
-      if (dtype == -1) {
-        dtype = platform::ToNCCLDataType(lod_tensor.type());
+        if (dtype == -1) {
+          dtype = platform::ToNCCLDataType(lod_tensor.type());
+        }
+
+        if (numel == 0) {
+          numel = static_cast<size_t>(lod_tensor.numel());
+        }
+
+        int dev_id = boost::get<platform::CUDAPlace>(p).device;
+        auto &nccl_ctx = nccl_ctxs_.at(dev_id);
+        auto stream = nccl_ctx.stream();
+        auto comm = nccl_ctx.comm_;
+        all_reduce_calls.emplace_back([=] {
+          PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+              buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
+              ncclSum, comm, stream));
+        });
       }
 
-      if (numel == 0) {
-        numel = static_cast<size_t>(lod_tensor.numel());
+      platform::NCCLGroupGuard guard;
+      for (auto &call : all_reduce_calls) {
+        call();
       }
+    } else {  // Special handle CPU only Operator's gradient. Like CRF
+      framework::LoDTensor trg;
 
-      auto &nccl_ctx = nccl_ctxs_.at(dev_id);
-      auto stream = nccl_ctx.stream();
-      auto comm = nccl_ctx.comm_;
-      all_reduce_calls.emplace_back([=] {
-        PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-            buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
-            comm, stream));
-      });
-    }
+      // Reduce All Tensor to trg in CPU
+      ReduceLoDTensor func(lod_tensors, &trg);
+      VisitDataType(ToDataType(lod_tensors[0].type()), func);
 
-    platform::NCCLGroupGuard guard;
-    for (auto &call : all_reduce_calls) {
-      call();
+      // Copy trg to GPU
     }
   }
 }
-- 
GitLab
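
A note on the GPU branch: it queues each device's all-reduce as a lambda and
only invokes them later, inside a platform::NCCLGroupGuard scope. Below is a
minimal standalone sketch of that queue-then-group pattern, assuming (as
Paddle's guard does) that the guard brackets the enclosed calls with
ncclGroupStart()/ncclGroupEnd(); GroupGuard here is an illustrative stand-in
that prints instead of calling NCCL, not Paddle's class.

#include <functional>
#include <iostream>
#include <vector>

// Illustrative RAII stand-in for platform::NCCLGroupGuard: the real guard
// opens an NCCL group in its constructor and closes it in its destructor,
// so the per-device ncclAllReduce calls are issued as one fused group.
struct GroupGuard {
  GroupGuard() { std::cout << "ncclGroupStart()\n"; }
  ~GroupGuard() { std::cout << "ncclGroupEnd()\n"; }
};

int main() {
  // First queue one call per device, like all_reduce_calls in the patch.
  std::vector<std::function<void()>> all_reduce_calls;
  for (int dev_id = 0; dev_id < 2; ++dev_id) {
    all_reduce_calls.emplace_back([dev_id] {
      std::cout << "ncclAllReduce on device " << dev_id << "\n";
    });
  }
  // Then run them all inside a single group scope.
  GroupGuard guard;
  for (auto &call : all_reduce_calls) {
    call();
  }
  return 0;
}  // guard's destructor closes the group here

Deferring the calls this way lets every per-device argument (buffer, dtype,
numel, comm, stream) be captured up front, while the group guard ensures NCCL
sees the whole set of launches together rather than one at a time.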
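
The CPU fallback branch hinges on VisitDataType turning the tensor's runtime
type into the compile-time T that ReduceLoDTensor::operator() is instantiated
with. Below is a minimal standalone sketch of that dispatch pattern with no
Paddle dependencies; DType, SumInto, and VisitDType are illustrative names,
not Paddle APIs.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative runtime type tag; the patch derives the real tag via
// ToDataType(lod_tensors[0].type()).
enum class DType { kFP32, kINT64 };

// A functor with a templated operator(), shaped like ReduceLoDTensor:
// element-wise adds a source buffer into a destination buffer.
struct SumInto {
  void *dst_;
  const void *src_;
  size_t numel_;

  template <typename T>
  void operator()() const {
    auto *dst = static_cast<T *>(dst_);
    auto *src = static_cast<const T *>(src_);
    std::transform(src, src + numel_, dst, dst,
                   [](T a, T b) -> T { return a + b; });
  }
};

// Maps the runtime tag to a concrete T, playing the role of VisitDataType.
template <typename Visitor>
void VisitDType(DType t, const Visitor &v) {
  switch (t) {
    case DType::kFP32:
      v.template operator()<float>();
      break;
    case DType::kINT64:
      v.template operator()<int64_t>();
      break;
  }
}

int main() {
  std::vector<float> acc{1.f, 2.f, 3.f};
  std::vector<float> grad{10.f, 20.f, 30.f};
  SumInto func{acc.data(), grad.data(), acc.size()};
  VisitDType(DType::kFP32, func);  // acc becomes {11, 22, 33}
  for (float x : acc) std::cout << x << ' ';
  std::cout << '\n';
  return 0;
}

In the patch itself, ReduceLoDTensor first copies lod_tensors[0] into trg and
then accumulates the remaining tensors the same element-wise way; the trailing
"// Copy trg to GPU" comment marks that final step as still unimplemented in
this stash.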