diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 85b649b2937f6a281b9ee1fe7bae8101169f6102..897e41f79f4e3bb9cecbe7b42fc6c4fd3401d839 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -1,5 +1,5 @@
 cc_library(var_handle SRCS var_handle.cc DEPS place)
-cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
+cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
@@ -20,3 +20,11 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
            simple_threadpool device_context)
+
+cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
+cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
+
+cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+        device_context broadcast_op_handle)
+cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+        device_context gather_op_handle)
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d29012380e1b1710704d71a28d21dcc3097eb51
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Returns the underlying tensor of a Variable that holds either a LoDTensor
+// or a SelectedRows.
+Tensor *GetTensorFromVar(Variable *in_var) {
+  if (in_var->IsType<LoDTensor>()) {
+    return in_var->GetMutable<LoDTensor>();
+  } else if (in_var->IsType<SelectedRows>()) {
+    return in_var->GetMutable<SelectedRows>()->mutable_value();
+  } else {
+    PADDLE_THROW("Var should be LoDTensor or SelectedRows.");
+  }
+  return nullptr;
+}
+
+BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+                                     const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+
+void BroadcastOpHandle::RunImpl() {
+  // The inputs may contain dummy vars; keep only the real VarHandles.
+  std::vector<VarHandle *> in_var_handle;
+  for (auto *in : inputs_) {
+    auto *in_handle = dynamic_cast<VarHandle *>(in);
+    if (in_handle) {
+      in_var_handle.push_back(in_handle);
+    }
+  }
+  PADDLE_ENFORCE_EQ(in_var_handle.size(), 1,
+                    "The number of inputs should be one.");
+
+  // The outputs may contain dummy vars; keep only the real VarHandles.
+  std::vector<VarHandle *> out_var_handles;
+  for (auto *out : outputs_) {
+    auto *out_handle = dynamic_cast<VarHandle *>(out);
+    if (out_handle) {
+      out_var_handles.push_back(out_handle);
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(
+      out_var_handles.size(), places_.size(),
+      "The number of outputs should be equal to the number of places.");
+
+  // Wait until the input is generated; this Wait is asynchronous.
+  auto &in_place = in_var_handle[0]->place_;
+  if (in_var_handle[0]->generated_op_) {
+    for (auto *out : out_var_handles) {
+      auto &out_p = out->place_;
+      in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]);
+    }
+  }
+
+  // Find the input variable in its local scope.
+  auto in_scope_idx = in_var_handle[0]->scope_idx_;
+  auto in_var =
+      local_scopes_.at(in_scope_idx)->FindVar(in_var_handle[0]->name_);
+  Tensor *in_tensor = GetTensorFromVar(in_var);
+
+  for (auto *out : out_var_handles) {
+    auto &out_p = out->place_;
+    auto out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_);
+
+    PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(),
+                      "Places must be all on CPU or all on CUDA.");
+
+    if (in_var->IsType<SelectedRows>()) {
+      auto &in_sr = in_var->Get<SelectedRows>();
+      auto out_sr = out_var->GetMutable<SelectedRows>();
+      if (&in_sr == out_sr) continue;
+      out_sr->set_height(in_sr.height());
+      out_sr->set_rows(in_sr.rows());
+      out_sr->mutable_value()->Resize(in_sr.value().dims());
+      out_sr->mutable_value()->mutable_data(out_p, in_sr.value().type());
+    } else if (in_var->IsType<LoDTensor>()) {
+      auto &in_lod = in_var->Get<LoDTensor>();
+      auto out_lod = out_var->GetMutable<LoDTensor>();
+      if (&in_lod == out_lod) continue;
+      out_lod->set_lod(in_lod.lod());
+      out_lod->Resize(in_lod.dims());
+      out_lod->mutable_data(out_p, in_lod.type());
+    } else {
+      PADDLE_THROW("Var should be LoDTensor or SelectedRows.");
+    }
+
+    Tensor *out_tensor = GetTensorFromVar(out_var);
+    paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]),
+                                  out_tensor);
+  }
+}
+
+std::string BroadcastOpHandle::Name() const { return "broadcast"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3292422522b64a38a50f39f04e6f0d2e15492dd
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Broadcasts one input variable to the same-named variable in every place.
+struct BroadcastOpHandle : public OpHandleBase {
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+
+  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places);
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; }
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dfc52b012f8b6bf5cf1a3feab90dc1ec7842ad6c
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -0,0 +1,231 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestBroadcastOpHandle {
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+  std::vector<Scope *> local_scopes_;
+  Scope g_scope_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> gpu_list_;
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+#else
+      PADDLE_THROW("CUDA is not supported.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+    }
+  }
+
+  void InitBroadcastOp(size_t input_scope_idx) {
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      local_scopes_[j]->Var("out");
+    }
+    local_scopes_[input_scope_idx]->Var("input");
+
+    op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
+
+    vars_.emplace_back(new VarHandle());
+    VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
+    in_var_handle->place_ = gpu_list_[input_scope_idx];
+    in_var_handle->name_ = "input";
+    in_var_handle->version_ = 1;
+    in_var_handle->scope_idx_ = input_scope_idx;
+    in_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(in_var_handle);
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(dummy_var_handle);
+
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      vars_.emplace_back(new VarHandle());
+      VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
+      out_var_handle->place_ = gpu_list_[j];
+      out_var_handle->name_ = "out";
+      out_var_handle->version_ = 2;
+      out_var_handle->scope_idx_ = j;
+      op_handle_->AddOutput(out_var_handle);
+    }
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* out_dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    out_dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddOutput(out_dummy_var_handle);
+  }
+
+  void TestBroadcastLodTensor(size_t input_scope_idx) {
+    auto in_var = local_scopes_[input_scope_idx]->Var("input");
+    auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+    in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+    f::LoD lod{{0, 10, 20}};
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
+    in_lod_tensor->set_lod(lod);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      auto out_var = local_scopes_[j]->Var("out");
+      auto out_tensor = out_var->Get<f::LoDTensor>();
+      PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
+
+      f::Tensor result_tensor;
+      f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor);
+      float* ct = result_tensor.mutable_data<float>(cpu_place);
+
+      for (int64_t i = 0; i < f::product(kDims); ++i) {
+        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
+      }
+    }
+  }
+
+  void TestBroadcastSelectedRows(size_t input_scope_idx) {
+    auto in_var = local_scopes_[input_scope_idx]->Var("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+    auto value = in_selected_rows->mutable_value();
+    value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+    int height = static_cast<int>(kDims[0]) * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    in_selected_rows->set_height(height);
+    in_selected_rows->set_rows(rows);
+
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), value);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      auto out_var = local_scopes_[j]->Var("out");
+      auto& out_select_rows = out_var->Get<f::SelectedRows>();
+      auto rt = out_select_rows.value();
+
+      PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                        "height is not equal.");
+      for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+        PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
+      }
+
+      f::Tensor result_tensor;
+      f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor);
+      float* ct = result_tensor.data<float>();
+
+      for (int64_t i = 0; i < f::product(kDims); ++i) {
+        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
+      }
+    }
+  }
+};
+
+TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastLodTensor(input_scope_idx);
+}
+
+TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastSelectedRows(input_scope_idx);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastLodTensor(input_scope_idx);
+}
+
+TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastSelectedRows(input_scope_idx);
+}
+#endif
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8dd85be567d33991ac003707fec939a61a2d0962
--- /dev/null
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -0,0 +1,126 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/gather_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
+                               const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+
+void GatherOpHandle::RunImpl() {
+  // The inputs may contain dummy vars; keep only the real VarHandles.
+  std::vector<VarHandle *> in_var_handles;
+  for (auto *in : inputs_) {
+    auto *in_handle = dynamic_cast<VarHandle *>(in);
+    if (in_handle) {
+      in_var_handles.push_back(in_handle);
+    }
+  }
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), places_.size(),
+      "The number of inputs should be equal to the number of places.");
+
+  // The outputs may contain dummy vars; keep only the real VarHandles.
+  std::vector<VarHandle *> out_var_handles;
+  for (auto *out : outputs_) {
+    auto *out_handle = dynamic_cast<VarHandle *>(out);
+    if (out_handle) {
+      out_var_handles.push_back(out_handle);
+    }
+  }
+  PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
+                    "The number of outputs should be one.");
+
+  auto *in_0_handle = in_var_handles[0];
+  auto pre_in_var =
+      local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
+  auto pre_place = in_0_handle->place_;
+
+  PADDLE_ENFORCE(pre_in_var->IsType<SelectedRows>(),
+                 "Currently, gather_op can only gather SelectedRows.");
+
+  PADDLE_ENFORCE_EQ(out_var_handles[0]->place_.which(), pre_place.which(),
+                    "The place of input and output should be the same.");
+
+  // Wait until all inputs are generated; this Wait is asynchronous.
+  for (auto *in : in_var_handles) {
+    if (in->generated_op_) {
+      in->generated_op_->Wait(dev_ctxes_[in->place_]);
+    }
+  }
+
+  std::vector<int64_t> out_rows;
+  std::vector<Tensor> in_tensors;
+  std::vector<platform::Place> in_places;
+
+  auto &pre_in = pre_in_var->Get<SelectedRows>();
+  // gather the inputs
+  for (auto *in : in_var_handles) {
+    auto in_p = in->place_;
+    in_places.push_back(in_p);
+    PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
+                      "Places must be all on CPU or all on CUDA.");
+    auto in_var = local_scopes_.at(in->scope_idx_)->FindVar(in->name_);
+    auto &in_sr = in_var->Get<SelectedRows>();
+
+    PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
+                      "The type of the inputs is not consistent.");
+    PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(),
+                      "The height of the inputs is not consistent.");
+    PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(),
+                      "The dims of the inputs is not consistent.");
+
+    auto in_sr_rows = in_sr.rows();
+    out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
+
+    in_tensors.emplace_back(in_sr.value());
+  }
+
+  // write the output
+  auto &out_place = out_var_handles[0]->place_;
+  auto out_scope_idx = out_var_handles[0]->scope_idx_;
+  auto out_var =
+      local_scopes_[out_scope_idx]->FindVar(out_var_handles[0]->name_);
+
+  auto out = out_var->GetMutable<SelectedRows>();
+  out->set_height(pre_in.height());
+  out->set_rows(out_rows);
+  size_t rows = out_rows.size();
+  DDim out_dim = pre_in.GetCompleteDims();
+  out_dim[0] = static_cast<int64_t>(rows);
+  out->mutable_value()->Resize(out_dim);
+  out->mutable_value()->mutable_data(out_place, pre_in.value().type());
+  Tensor *out_tensor = out->mutable_value();
+
+  // copy each input slice into the corresponding rows of the output
+  int s = 0, e = 0;
+  for (size_t j = 0; j < in_tensors.size(); ++j) {
+    e += in_tensors[j].dims()[0];
+    auto sub_out = out_tensor->Slice(s, e);
+    paddle::framework::TensorCopy(in_tensors[j], out_place,
+                                  *(dev_ctxes_[in_places[j]]), &sub_out);
+    s = e;
+  }
+}
+
+std::string GatherOpHandle::Name() const { return "gather"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c0231f642c05e6b558b7e2518a15e08c816fe4b
--- /dev/null
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Gathers the same-named SelectedRows from every place into one output.
+struct GatherOpHandle : public OpHandleBase {
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+
+  GatherOpHandle(const std::vector<Scope *> &local_scopes,
+                 const std::vector<platform::Place> &places);
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; }
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..10839f239d59e97946575297a6d125968a1458f4
--- /dev/null
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -0,0 +1,192 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/gather_op_handle.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestGatherOpHandle {
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+  std::vector<Scope *> local_scopes_;
+  Scope g_scope_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> gpu_list_;
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Gather, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+#else
+      PADDLE_THROW("CUDA is not supported.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+    }
+  }
+
+  void InitGatherOp(size_t input_scope_idx) {
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      local_scopes_[j]->Var("out");
+    }
+    local_scopes_[input_scope_idx]->Var("input");
+
+    op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
+    // add input
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      vars_.emplace_back(new VarHandle());
+      VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
+      in_var_handle->place_ = gpu_list_[j];
+      in_var_handle->name_ = "input";
+      in_var_handle->version_ = 1;
+      in_var_handle->scope_idx_ = j;
+      in_var_handle->generated_op_ = nullptr;
+      op_handle_->AddInput(in_var_handle);
+    }
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* in_dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    in_dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(in_dummy_var_handle);
+
+    // add output
+    vars_.emplace_back(new VarHandle());
+    VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
+    out_var_handle->place_ = gpu_list_[input_scope_idx];
+    out_var_handle->name_ = "out";
+    out_var_handle->version_ = 2;
+    out_var_handle->scope_idx_ = input_scope_idx;
+    op_handle_->AddOutput(out_var_handle);
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    op_handle_->AddOutput(dummy_var_handle);
+  }
+
+  void TestGatherSelectedRows(size_t output_scope_idx) {
+    int height = static_cast<int>(kDims[0]) * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+
+    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
+         ++input_scope_idx) {
+      auto in_var = local_scopes_[input_scope_idx]->Var("input");
+      auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+      auto value = in_selected_rows->mutable_value();
+      value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+
+      in_selected_rows->set_height(height);
+      in_selected_rows->set_rows(rows);
+
+      paddle::framework::TensorFromVector<float>(
+          send_vector, *(ctxs_[input_scope_idx]), value);
+      value->Resize(kDims);
+    }
+
+    auto out_var = local_scopes_[output_scope_idx]->Var("out");
+    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
+
+    auto in_var = local_scopes_[output_scope_idx]->Var("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+
+    out_selected_rows->mutable_value()->ShareDataWith(
+        in_selected_rows->value());
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+
+    auto& out_select_rows = out_var->Get<f::SelectedRows>();
+    auto rt = out_select_rows.value();
+
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                      "height is not equal.");
+    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
+    }
+
+    f::Tensor result_tensor;
+    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
+    float* ct = result_tensor.data<float>();
+
+    for (int64_t j = 0; j < f::product(kDims); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
+    }
+  }
+};
+
+TEST(GatherTester, TestCPUGatherTestSelectedRows) {
+  TestGatherOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitGatherOp(input_scope_idx);
+  test_op.TestGatherSelectedRows(input_scope_idx);
+}
+
+#ifdef PADDLE_WITH_CUDA
+
+TEST(GatherTester, TestGPUGatherTestSelectedRows) {
+  TestGatherOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitGatherOp(input_scope_idx);
+  test_op.TestGatherSelectedRows(input_scope_idx);
+}
+#endif
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 569dda17c6e91d5658c4f8b9ba0b8c8fbd966832..871e41343f53b801a22d3a450f0906f37fb372d1 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase {
   // version field currently is not used, however, just store the version to
   // debug easily.
   size_t version_;
+  size_t scope_idx_;
   std::string name_;
   platform::Place place_;
 };
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 1d864af011bced9df188147ec436b8de12947ba9..d1b01ae05b808b229309e9689165483a11530c84 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -11,8 +11,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/framework/tensor_util.h"
+#include <algorithm>
+#include <limits>
+#include <vector>
 
 namespace paddle {
 namespace framework {
@@ -65,8 +67,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     auto ctx_place = ctx.GetPlace();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
     memory::Copy(
         dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());