diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index d7f2eedbe607bb1217c3a5452c1408307bbc5e09..17460a15180e5572929c6bce2f39c038a3d2201b 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #include <algorithm>
-#include "paddle/fluid/framework/details/reduce_util.h"
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 7f21427330ad55cbe2c16b6547812612bd2c8983..ecaa83eb7ebfc227d1e563deca8fbea8caee4cc5 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -121,7 +121,7 @@ void ReduceOpHandle::RunImpl() {
       auto &p = in_places[i];
       auto &lod_tensor = lod_tensors[i];
       int dev_id = boost::get<platform::CUDAPlace>(p).device;
-      auto &nccl_ctx = nccl_ctxs_.at(dev_id);
+      auto &nccl_ctx = nccl_ctxs_->at(dev_id);
       auto stream = nccl_ctx.stream();
       auto comm = nccl_ctx.comm_;
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 0bfd83c71fd7d8a3e90b0d8d16dc0e947d611594..0e91ad20695d7599737f02b9856535326a171808 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -34,13 +34,15 @@ struct ReduceOpHandle : public OpHandleBase {
   const std::vector<platform::Place> &places_;
 
 #ifdef PADDLE_WITH_CUDA
-  const platform::NCCLContextMap &nccl_ctxs_;
+  const platform::NCCLContextMap *nccl_ctxs_;
   ReduceOpHandle(const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places,
-                 const platform::NCCLContextMap &nccl_ctxs)
+                 const platform::NCCLContextMap *nccl_ctxs)
       : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
-    for (auto &p_ctx : nccl_ctxs_.contexts_) {
-      dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+    if (nccl_ctxs_) {
+      for (auto &p_ctx : nccl_ctxs_->contexts_) {
+        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+      }
     }
   }
 #else
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
index 74ed6bf2ac5d88383a44755a825c143bd279b603..b0b8eb2cc77bc8d56f89c8adce96e342774c3efa 100644
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -44,7 +44,9 @@ struct TestReduceOpHandle {
       ctxs_[j]->Wait();
     }
 #ifdef PADDLE_WITH_CUDA
-    nccl_ctxs_->WaitAll();
+    if (nccl_ctxs_) {
+      nccl_ctxs_->WaitAll();
+    }
 #endif
   }
 
@@ -64,6 +66,7 @@ struct TestReduceOpHandle {
       gpu_list_.push_back(p);
       ctxs_.emplace_back(new p::CUDADeviceContext(p));
     }
+    nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
 #else
     PADDLE_THROW("CUDA is not support.");
 #endif
@@ -74,10 +77,10 @@ struct TestReduceOpHandle {
       gpu_list_.push_back(p);
       ctxs_.emplace_back(new p::CPUDeviceContext(p));
     }
-  }
 #ifdef PADDLE_WITH_CUDA
-    nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
+      nccl_ctxs_.reset(nullptr);
 #endif
+    }
   }
 
   void InitReduceOp(size_t input_scope_idx) {
@@ -87,15 +90,27 @@ struct TestReduceOpHandle {
     }
     local_scopes_[input_scope_idx]->Var("input");
 
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
 #ifdef PADDLE_WITH_CUDA
-    op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_, *nccl_ctxs_));
+      op_handle_.reset(
+          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
 #else
-    op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_));
+      op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_));
 #endif
+    }
 
     // add input
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      if (!use_gpu_) {
+        op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      }
       vars_.emplace_back(new VarHandle());
       VarHandle *in_var_handle = static_cast<VarHandle *>(vars_.back().get());
       in_var_handle->place_ = gpu_list_[j];
@@ -236,25 +251,31 @@ TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
   test_op.InitReduceOp(input_scope_idx);
   test_op.TestReduceSelectedRows(input_scope_idx);
 }
+TEST(ReduceTester, TestCPUReduceTestLodTensor) {
+  TestReduceOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitReduceOp(input_scope_idx);
+  test_op.TestReduceLodTensors(input_scope_idx);
+}
+#ifdef PADDLE_WITH_CUDA
 
-// #ifdef PADDLE_WITH_CUDA
-//
-// TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
-//   TestReduceOpHandle test_op;
-//   size_t input_scope_idx = 0;
-//   test_op.InitCtxOnGpu(true);
-//   test_op.InitReduceOp(input_scope_idx);
-//   test_op.TestReduceSelectedRows(input_scope_idx);
-// }
-//
-// TEST(ReduceTester, TestCPUReduceTestLodTensor) {
-//   TestReduceOpHandle test_op;
-//   size_t input_scope_idx = 0;
-//   test_op.InitCtxOnGpu(true);
-//   test_op.InitReduceOp(input_scope_idx);
-//   test_op.TestReduceLodTensors(input_scope_idx);
-// }
-// #endif
+TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
+  TestReduceOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitReduceOp(input_scope_idx);
+  test_op.TestReduceSelectedRows(input_scope_idx);
+}
+
+TEST(ReduceTester, TestGPUReduceTestLodTensor) {
+  TestReduceOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitReduceOp(input_scope_idx);
+  test_op.TestReduceLodTensors(input_scope_idx);
+}
+#endif
 
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/reduce_util.h b/paddle/fluid/framework/details/reduce_util.h
deleted file mode 100644
index 5d803e9923dcb1a3b23b4a8e61cb981d7ad5838f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/reduce_util.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/details/reduce_util.h"
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct ReduceLoDTensor {
-  const std::vector<LoDTensor> &src_tensors_;
-  LoDTensor &dst_tensor_;
-
-  ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
-      : src_tensors_(src), dst_tensor_(*dst) {}
-
-  template <typename T>
-  void operator()() const {
-    PADDLE_ENFORCE(!src_tensors_.empty());
-    auto &t0 = src_tensors_[0];
-    PADDLE_ENFORCE_NE(t0.numel(), 0);
-    dst_tensor_.Resize(t0.dims());
-    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
-    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
-
-    for (size_t i = 1; i < src_tensors_.size(); ++i) {
-      auto &t = src_tensors_[i];
-      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
-      PADDLE_ENFORCE_EQ(t.type(), t0.type());
-      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
-                     [](T a, T b) -> T { return a + b; });
-    }
-  }
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
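
Note for reviewers (not part of the patch): the ReduceLoDTensor functor deleted above, which now lives in reduce_and_gather.h, reduces equally-shaped CPU tensors by summing them element-wise: the first source is copied into the destination, then each remaining source is accumulated with std::transform. The sketch below illustrates just that reduction pattern in self-contained form; it substitutes std::vector for LoDTensor and assert for PADDLE_ENFORCE, and the name ReduceBuffers is illustrative, not Paddle API.

// Minimal sketch of the element-wise sum performed by ReduceLoDTensor,
// assuming plain contiguous buffers stand in for LoDTensor.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

template <typename T>
void ReduceBuffers(const std::vector<std::vector<T>> &srcs,
                   std::vector<T> *dst) {
  assert(!srcs.empty());            // stands in for PADDLE_ENFORCE
  const auto &t0 = srcs[0];
  assert(!t0.empty());              // stands in for PADDLE_ENFORCE_NE(numel, 0)
  dst->resize(t0.size());           // mirrors dst_tensor_.Resize(t0.dims())
  std::copy(t0.begin(), t0.end(), dst->begin());

  for (size_t i = 1; i < srcs.size(); ++i) {
    const auto &t = srcs[i];
    assert(t.size() == t0.size());  // mirrors the dims()/type() checks
    std::transform(t.begin(), t.end(), dst->begin(), dst->begin(),
                   [](T a, T b) -> T { return a + b; });
  }
}

int main() {
  std::vector<std::vector<float>> srcs = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
  std::vector<float> dst;
  ReduceBuffers(srcs, &dst);  // dst becomes {12, 15, 18}
  for (float v : dst) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}

The other substantive change serves the same goal: ReduceOpHandle now holds a nullable `const platform::NCCLContextMap *` instead of a reference, so the CPU-only path can pass nullptr (see nccl_ctxs_.reset(nullptr) in the test fixture), and both the ReduceOpHandle constructor and the fixture's WaitAll guard the pointer before touching NCCL state. That is what lets the previously commented-out GPU tests and the new CPU LodTensor test run from one fixture.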