From f28ae6e4b16322310ec91fa3e7f6916f2aa79889 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Wed, 21 Mar 2018 16:48:44 +0800
Subject: [PATCH] Reorganize Code

---
 paddle/fluid/framework/CMakeLists.txt         |  8 +-
 paddle/fluid/framework/details/CMakeLists.txt |  2 +
 .../details/nccl_all_reduce_op_handle.cc      | 74 +++++++++++++++++++
 .../details/nccl_all_reduce_op_handle.h       | 41 ++++++++++
 paddle/fluid/framework/parallel_executor.cc   | 65 +---------------
 5 files changed, 126 insertions(+), 64 deletions(-)
 create mode 100644 paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
 create mode 100644 paddle/fluid/framework/details/nccl_all_reduce_op_handle.h

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index cf288e78041..12d6541b8fa 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -87,9 +87,15 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo
 
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
 framework_proto backward glog lod_rank_table feed_fetch_method)
+
+if(WITH_GPU)
+  set(parallel_executor_cuda_deps nccl_all_reduce_op_handle)
+else()
+  set(parallel_executor_cuda_deps)
+endif()
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope
 framework_proto backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle
-           fetch_op_handle)
+           fetch_op_handle ${parallel_executor_cuda_deps})
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index aed444d9aa1..fb276ea7038 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,3 +2,5 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+        dynload_cuda)
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
new file mode 100644
index 00000000000..a79c61f3593
--- /dev/null
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    const platform::NCCLContextMap &ctxs)
+    : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
+  for (auto &p : places_) {
+    this->dev_ctx_[p] = nccl_ctxs_.DevCtx(p);
+  }
+}
+
+void NCCLAllReduceOpHandle::RunImpl() {
+  if (inputs_.size() == 1) {
+    return;  // No need to all reduce when GPU count = 1;
+  } else {
+    // Wait input done
+    for (auto *in : inputs_) {
+      auto &p = static_cast<VarHandle *>(in)->place_;
+      in->generated_op_->Wait(dev_ctx_[p]);
+    }
+
+    auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
+    int dtype = -1;
+    size_t numel = 0;
+
+    platform::NCCLGroupGuard guard;
+
+    for (size_t i = 0; i < local_scopes_.size(); ++i) {
+      auto &p = places_[i];
+      auto *s = local_scopes_[i];
+      int dev_id = boost::get<platform::CUDAPlace>(p).device;
+
+      auto &lod_tensor = s->FindVar(var_name)->Get<framework::LoDTensor>();
+      void *buffer = const_cast<void *>(lod_tensor.data<void>());
+      uintptr_t buf = reinterpret_cast<uintptr_t>(buffer);
+      if (buf % sizeof(float) != 0) {
+        VLOG(3) << "Buffer is not aligned " << buf;
+      }
+
+      if (dtype == -1) {
+        dtype = platform::ToNCCLDataType(lod_tensor.type());
+      }
+
+      if (numel == 0) {
+        numel = static_cast<size_t>(lod_tensor.numel());
+      }
+      auto &nccl_ctx = nccl_ctxs_.at(dev_id);
+      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+          buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
+          nccl_ctx.comm_, nccl_ctx.stream()));
+    }
+  }
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
new file mode 100644
index 00000000000..7152d1a587e
--- /dev/null
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct NCCLAllReduceOpHandle : public OpHandleBase {
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+  const platform::NCCLContextMap &nccl_ctxs_;
+
+  NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                        const std::vector<platform::Place> &places,
+                        const platform::NCCLContextMap &ctxs);
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 8c29aacab6f..93db5ad3e5c 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "lod_tensor_array.h"
 #include "op_registry.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
 #include "paddle/fluid/framework/details/var_handle.h"
@@ -28,6 +29,7 @@ namespace framework {
 
 using details::DummyVarHandle;
 using details::FetchOpHandle;
+using details::NCCLAllReduceOpHandle;
 using details::OpHandleBase;
 using details::ScaleLossGradOpHandle;
 using details::VarHandle;
@@ -123,69 +125,6 @@ class ParallelExecutorPrivate {
     var.place_ = place;
     op_handle->AddOutput(&var);
   }
-};  // namespace framework
-
-struct NCCLAllReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
-  const platform::NCCLContextMap &nccl_ctxs_;
-
-  explicit NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
-                                 const std::vector<platform::Place> &places,
-                                 const platform::NCCLContextMap &ctxs)
-      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
-    for (auto &p : places_) {
-      this->dev_ctx_[p] = nccl_ctxs_.DevCtx(p);
-    }
-  }
-
-  void Wait(platform::DeviceContext *waited_dev) override {
-    OpHandleBase::Wait(waited_dev);
-  }
-
- protected:
-  void RunImpl() override {
-    if (inputs_.size() == 1) {
-      return;  // No need to all reduce when GPU count = 1;
-    } else {
-      // Wait input done
-      for (auto *in : inputs_) {
-        auto &p = static_cast<VarHandle *>(in)->place_;
-        in->generated_op_->Wait(dev_ctx_[p]);
-      }
-
-      auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
-      int dtype = -1;
-      size_t numel = 0;
-
-      platform::NCCLGroupGuard guard;
-
-      for (size_t i = 0; i < local_scopes_.size(); ++i) {
-        auto &p = places_[i];
-        auto *s = local_scopes_[i];
-        int dev_id = boost::get<platform::CUDAPlace>(p).device;
-
-        auto &lod_tensor = s->FindVar(var_name)->Get<framework::LoDTensor>();
-        void *buffer = const_cast<void *>(lod_tensor.data<void>());
-        uintptr_t buf = reinterpret_cast<uintptr_t>(buffer);
-        if (buf % sizeof(float) != 0) {
-          VLOG(3) << "Buffer is not aligned " << buf;
-        }
-
-        if (dtype == -1) {
-          dtype = platform::ToNCCLDataType(lod_tensor.type());
-        }
-
-        if (numel == 0) {
-          numel = static_cast<size_t>(lod_tensor.numel());
-        }
-        auto &nccl_ctx = nccl_ctxs_.at(dev_id);
-        PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-            buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
-            nccl_ctx.comm_, nccl_ctx.stream()));
-      }
-    }
-  }
 };
 
 struct ComputationOpHandle : public OpHandleBase {
-- 
GitLab