diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 181f08d028919d3d55821186d777f3a8a636ae3a..6f990e28666829dd2f2fe6f07362188a77ae6468 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,8 +2,6 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-    dynload_cuda)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
@@ -11,12 +9,17 @@ cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 
 if(WITH_GPU)
+  nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+      dynload_cuda)
   set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+  nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim dynload_cuda)
 else()
   set(multi_devices_graph_builder_deps)
+  cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
 endif()
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-            scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
+        scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
+
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
     simple_threadpool device_context)
@@ -30,3 +33,5 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context gather_op_handle)
+cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+        device_context reduce_op_handle)
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index e587210b357ea6caa3272903d8aa6b3e4b2e8228..28f9139987faa3dfee1e7733fb846a4d4efadc7b 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -13,8 +13,8 @@ // limitations under the License.
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" - #include +#include "paddle/fluid/framework/details/reduce_and_gather.h" namespace paddle { namespace framework { @@ -29,32 +29,6 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( } } -struct ReduceLoDTensor { - const std::vector &src_tensors_; - LoDTensor &dst_tensor_; - - ReduceLoDTensor(const std::vector &src, LoDTensor *dst) - : src_tensors_(src), dst_tensor_(*dst) {} - - template - void operator()() const { - PADDLE_ENFORCE(!src_tensors_.empty()); - auto &t0 = src_tensors_[0]; - PADDLE_ENFORCE_NE(t0.numel(), 0); - dst_tensor_.Resize(t0.dims()); - T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); - std::copy(t0.data(), t0.data() + t0.numel(), dst); - - for (size_t i = 1; i < src_tensors_.size(); ++i) { - auto &t = src_tensors_[i]; - PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); - PADDLE_ENFORCE_EQ(t.type(), t0.type()); - std::transform(t.data(), t.data() + t.numel(), dst, dst, - [](T a, T b) -> T { return a + b; }); - } - } -}; - void NCCLAllReduceOpHandle::RunImpl() { if (inputs_.size() == 1) { return; // No need to all reduce when GPU count = 1; diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h new file mode 100644 index 0000000000000000000000000000000000000000..7957fba8a449f7dc05588fad335df0b45a34b575 --- /dev/null +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -0,0 +1,94 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..7957fba8a449f7dc05588fad335df0b45a34b575
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/selected_rows.h"
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct ReduceLoDTensor {
+  const std::vector<LoDTensor> &src_tensors_;
+  LoDTensor &dst_tensor_;
+
+  ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
+      : src_tensors_(src), dst_tensor_(*dst) {}
+
+  template <typename T>
+  void operator()() const {
+    PADDLE_ENFORCE(!src_tensors_.empty());
+    auto &t0 = src_tensors_[0];
+    PADDLE_ENFORCE_NE(t0.numel(), 0);
+    dst_tensor_.Resize(t0.dims());
+    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
+    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
+
+    for (size_t i = 1; i < src_tensors_.size(); ++i) {
+      auto &t = src_tensors_[i];
+      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
+      PADDLE_ENFORCE_EQ(t.type(), t0.type());
+      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
+                     [](T a, T b) -> T { return a + b; });
+    }
+  }
+};
+
+inline void GatherSelectedRows(
+    const std::vector<const SelectedRows *> &src_selecte_rows_,
+    const std::vector<platform::Place> &in_places,
+    const std::unordered_map<platform::Place, platform::DeviceContext *,
+                             platform::PlaceHash> &dev_ctxes,
+    const platform::Place &out_place, SelectedRows *dst_selecte_rows) {
+  PADDLE_ENFORCE(!src_selecte_rows_.empty());
+
+  std::vector<Tensor> in_tensors;
+  std::vector<int64_t> out_rows;
+
+  for (auto in_sr_ptr : src_selecte_rows_) {
+    auto &in_sr = *in_sr_ptr;
+    in_tensors.emplace_back(in_sr.value());
+    out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
+  }
+
+  auto &pre_in = src_selecte_rows_[0];
+
+  auto &dst_tensor = *dst_selecte_rows;
+  dst_tensor.set_height(pre_in->height());
+  dst_tensor.set_rows(out_rows);
+  size_t rows = out_rows.size();
+  DDim out_dim = pre_in->GetCompleteDims();
+  out_dim[0] = static_cast<int64_t>(rows);
+  dst_tensor.mutable_value()->Resize(out_dim);
+  dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
+  Tensor *out_tensor = dst_tensor.mutable_value();
+
+  // copy
+  int s = 0, e = 0;
+  for (size_t j = 0; j < in_tensors.size(); ++j) {
+    e += in_tensors[j].dims()[0];
+    auto sub_out = out_tensor->Slice(s, e);
+    paddle::framework::TensorCopy(in_tensors[j], out_place,
+                                  *(dev_ctxes.at(in_places[j])), &sub_out);
+    s = e;
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
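GatherSelectedRows above concatenates rather than sums: the per-device row indices are appended in input order, and each input's value block is copied into the matching slice of the output, tracked by the running offsets s and e. A hedged, framework-free sketch of the same bookkeeping, with a hypothetical SimpleRows type standing in for SelectedRows:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Stand-in for SelectedRows: a list of logical row indices plus a dense
    // value block of rows.size() x width elements, stored row-major.
    struct SimpleRows {
      std::vector<int64_t> rows;
      std::vector<float> value;
    };

    // Mirror of the offset bookkeeping in GatherSelectedRows: rows are appended
    // in input order, and each input's value block is copied into the output
    // slice [s, e).
    inline SimpleRows GatherRows(const std::vector<const SimpleRows *> &ins,
                                 int64_t width) {
      SimpleRows out;
      for (const SimpleRows *in : ins) {
        out.rows.insert(out.rows.end(), in->rows.begin(), in->rows.end());
      }
      out.value.resize(out.rows.size() * width);
      int64_t s = 0, e = 0;  // running slice offsets, as in the new header
      for (const SimpleRows *in : ins) {
        e += static_cast<int64_t>(in->rows.size());
        std::copy(in->value.begin(), in->value.end(),
                  out.value.begin() + s * width);
        s = e;
      }
      return out;
    }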
+ +#include "paddle/fluid/framework/details/reduce_op_handle.h" +#include "paddle/fluid/framework/details/reduce_and_gather.h" + +namespace paddle { +namespace framework { +namespace details { + +void ReduceOpHandle::RunImpl() { + // the input and output may have dummy var. + std::vector in_var_handles = GetValidVarHandles(inputs_); + std::vector out_var_handles = GetValidVarHandles(outputs_); + + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The number of output should equal to the number of places."); + PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, + "The number of output should be one."); + + // Wait input done, this Wait is asynchronous operation + WaitEvents(in_var_handles); + + // check in the same place + auto in_0_handle = in_var_handles[0]; + auto pre_place = in_0_handle->place_; + + std::vector in_places; + for (auto *in_handle : in_var_handles) { + auto in_p = in_handle->place_; + PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(), + "Places must be all on CPU or all on CUDA."); + in_places.emplace_back(in_p); + } + + auto out_var = local_scopes_[out_var_handles[0]->scope_idx_]->FindVar( + out_var_handles[0]->name_); + + auto pre_in_var = + local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_); + + if (pre_in_var->IsType()) { + auto &pre_in = pre_in_var->Get(); + std::vector in_selected_rows; + + for (auto *in_handle : in_var_handles) { + auto in_var = + local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_); + auto &in_sr = in_var->Get(); + + PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(), + "The type of input is not consistent."); + + in_selected_rows.emplace_back(&in_sr); + } + auto trg = out_var->GetMutable(); + GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, + out_var_handles[0]->place_, trg); + } else { + auto pre_in = pre_in_var->Get(); + std::vector lod_tensors; + + // can be refined + for (auto *in_handle : in_var_handles) { + auto in_var = + local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_); + auto &in_sr = in_var->Get(); + + PADDLE_ENFORCE_EQ(in_sr.type(), pre_in.type(), + "The type of input is not consistent."); + + lod_tensors.emplace_back(in_sr); + } + + auto trg = out_var->GetMutable(); + trg->Resize(pre_in.dims()); + trg->mutable_data(out_var_handles[0]->place_, pre_in.type()); + + if (paddle::platform::is_cpu_place(pre_place)) { + ReduceLoDTensor func(lod_tensors, trg); + VisitDataType(ToDataType(lod_tensors[0].type()), func); + } else if (paddle::platform::is_gpu_place(pre_place)) { +#ifdef PADDLE_WITH_CUDA + auto out_p = out_var_handles[0]->place_; + int root = boost::get(out_p).device; + + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = in_places[i]; + auto &lod_tensor = lod_tensors[i]; + + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + void *buffer = const_cast(lod_tensor.data()); + void *recvbuffer = nullptr; + if (root == dev_id) { + recvbuffer = trg->mutable_data(out_var_handles[0]->place_); + } + + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(platform::dynload::ncclReduce( + buffer, recvbuffer, static_cast(lod_tensor.numel()), + platform::ToNCCLDataType(lod_tensor.type()), ncclSum, root, comm, + stream)); + }); + } + + this->RunAndRecordEvent([&] { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + }); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + 
PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); + } + } +} + +void ReduceOpHandle::WaitEvents( + const std::vector &in_var_handles) { + if (in_var_handles[0]->generated_op_) { + for (auto *in : in_var_handles) { + in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in->place_]); + } + } +} + +std::vector ReduceOpHandle::GetValidVarHandles( + const std::vector &inputs) { + std::vector in_var_handles; + for (auto *in : inputs) { + auto *in_handle = dynamic_cast(in); + if (in_handle) { + in_var_handles.push_back(in_handle); + } + } + return in_var_handles; +} +std::string ReduceOpHandle::Name() const { return "reduce"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..7b36ce4a7bceaeb93ceb03730b2d54d0f36fed3d --- /dev/null +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +struct ReduceOpHandle : public OpHandleBase { + const std::vector &local_scopes_; + const std::vector &places_; + +#ifdef PADDLE_WITH_CUDA + const platform::NCCLContextMap *nccl_ctxs_; + ReduceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *nccl_ctxs) + : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) { + if (nccl_ctxs_) { + for (auto &p_ctx : nccl_ctxs_->contexts_) { + dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); + } + } + } +#else + ReduceOpHandle(const std::vector &local_scopes, + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} +#endif + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; + std::vector GetValidVarHandles( + const std::vector &inputs); + + void WaitEvents(const std::vector &in_var_handles); +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed6a1355a3a4c4e5c0b70ef6cb705be0a768280f --- /dev/null +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -0,0 +1,275 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ed6a1355a3a4c4e5c0b70ef6cb705be0a768280f
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -0,0 +1,275 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/reduce_op_handle.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestReduceOpHandle {
+  bool use_gpu_;
+  Scope g_scope_;
+  std::vector<Scope *> local_scopes_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> gpu_list_;
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
+#endif
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+#ifdef PADDLE_WITH_CUDA
+    if (nccl_ctxs_) {
+      nccl_ctxs_->WaitAll();
+    }
+#endif
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    use_gpu_ = use_gpu;
+    if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Reduce, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
+#else
+      PADDLE_THROW("CUDA is not supported.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+#ifdef PADDLE_WITH_CUDA
+      nccl_ctxs_.reset(nullptr);
+#endif
+    }
+  }
+
+  void InitReduceOp(size_t input_scope_idx) {
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      local_scopes_[j]->Var("out");
+    }
+    local_scopes_[input_scope_idx]->Var("input");
+
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      PADDLE_THROW("CUDA is not supported.");
+#endif
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_));
+#endif
+    }
+
+    // add input
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      if (!use_gpu_) {
+        op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      }
+      auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+      in_var_handle->generated_op_ = nullptr;
+      vars_.emplace_back(in_var_handle);
+      op_handle_->AddInput(in_var_handle);
+    }
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle *in_dummy_var_handle =
+        static_cast<DummyVarHandle *>(vars_.back().get());
+    in_dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(in_dummy_var_handle);
+
+    // add output
+    auto *out_var_handle =
+        new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
+    vars_.emplace_back(out_var_handle);
+    op_handle_->AddOutput(out_var_handle);
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle *dummy_var_handle =
+        static_cast<DummyVarHandle *>(vars_.back().get());
+    op_handle_->AddOutput(dummy_var_handle);
+  }
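In the SelectedRows test that follows, every scope is filled with the same rows and the same value block, and the reduce gathers rather than sums, so the expected output is simply those rows and values repeated once per place; that is why the assertions index with k % rows.size() and j % send_vector.size(). A hedged sketch of the expected layout:

    #include <cstdint>
    #include <vector>

    // Build the result TestReduceSelectedRows expects: the per-device rows and
    // values are repeated once per device (num_devices == gpu_list_.size()).
    inline void ExpectedGather(const std::vector<int64_t> &rows,
                               const std::vector<float> &values,
                               size_t num_devices,
                               std::vector<int64_t> *out_rows,
                               std::vector<float> *out_values) {
      for (size_t d = 0; d < num_devices; ++d) {
        out_rows->insert(out_rows->end(), rows.begin(), rows.end());
        out_values->insert(out_values->end(), values.begin(), values.end());
      }
    }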
+
+  void TestReduceSelectedRows(size_t output_scope_idx) {
+    int height = kDims[0] * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    std::vector<float> send_vector(f::product(kDims));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+
+    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
+         ++input_scope_idx) {
+      auto in_var = local_scopes_[input_scope_idx]->Var("input");
+      auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+      auto value = in_selected_rows->mutable_value();
+      value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+
+      in_selected_rows->set_height(height);
+      in_selected_rows->set_rows(rows);
+
+      paddle::framework::TensorFromVector<float>(
+          send_vector, *(ctxs_[input_scope_idx]), value);
+      value->Resize(kDims);
+    }
+
+    auto out_var = local_scopes_[output_scope_idx]->Var("out");
+    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
+
+    auto in_var = local_scopes_[output_scope_idx]->Var("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+
+    out_selected_rows->mutable_value()->ShareDataWith(
+        in_selected_rows->value());
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+
+    auto &out_select_rows = out_var->Get<f::SelectedRows>();
+    auto rt = out_select_rows.value();
+
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                      "height is not equal.");
+    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
+    }
+
+    f::Tensor result_tensor;
+    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
+    float *ct = result_tensor.data<float>();
+
+    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
+    }
+  }
+
+  void TestReduceLodTensors(size_t output_scope_idx) {
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+    f::LoD lod{{0, 10, 20}};
+
+    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
+         ++input_scope_idx) {
+      auto in_var = local_scopes_[input_scope_idx]->Var("input");
+      auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+      in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+      in_lod_tensor->set_lod(lod);
+
+      paddle::framework::TensorFromVector<float>(
+          send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
+    }
+
+    auto out_var = local_scopes_[output_scope_idx]->Var("out");
+    auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
+
+    auto in_var = local_scopes_[output_scope_idx]->Var("input");
+    auto in_lodtensor = in_var->Get<f::LoDTensor>();
+
+    out_lodtensor->ShareDataWith(in_lodtensor);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+
+    auto &rt = out_var->Get<f::LoDTensor>();
+
+    f::Tensor result_tensor;
+    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
+    float *ct = result_tensor.data<float>();
+
+    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
+    }
+  }
+};
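TestReduceLodTensors above feeds an identical send_vector into every scope and the reduce sums element-wise, so each output element should equal the input value times the number of places, which is exactly what ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5) checks. A hedged helper for the expected tensor:

    #include <cstddef>
    #include <vector>

    // Expected LoDTensor result: an elementwise sum over identical inputs is
    // just the input scaled by the number of participating places.
    inline std::vector<float> ExpectedSum(const std::vector<float> &send_vector,
                                          size_t num_places) {
      std::vector<float> out(send_vector.size());
      for (size_t j = 0; j < out.size(); ++j) {
        out[j] = send_vector[j] * static_cast<float>(num_places);
      }
      return out;
    }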
+
+TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
+  TestReduceOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitReduceOp(input_scope_idx);
+  test_op.TestReduceSelectedRows(input_scope_idx);
+}
+TEST(ReduceTester, TestCPUReduceTestLodTensor) {
+  TestReduceOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitReduceOp(input_scope_idx);
+  test_op.TestReduceLodTensors(input_scope_idx);
+}
+#ifdef PADDLE_WITH_CUDA
+
+TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
+  TestReduceOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitReduceOp(input_scope_idx);
+  test_op.TestReduceSelectedRows(input_scope_idx);
+}
+
+TEST(ReduceTester, TestGPUReduceTestLodTensor) {
+  TestReduceOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitReduceOp(input_scope_idx);
+  test_op.TestReduceLodTensors(input_scope_idx);
+}
+#endif
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle