diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 21db93958a4a586c74a1e060f1f04b5af1dcd889..0858ec6a226cc8760918e1e7427f75c7fa0b7660 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -9,36 +9,37 @@ else() endif() configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if(WITH_GRPC) grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) - return() -endif() - - -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc) +else() + set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc + brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc - brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc + brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory) -brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc - brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc - PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc) -set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) + set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) -cc_test(brpc_server_test SRCS rpc_server_test.cc - DEPS ${brpc_test_depends} SERIAL) + cc_test(brpc_server_test SRCS rpc_server_test.cc + DEPS ${brpc_test_depends} SERIAL) -cc_test(brpc_serde_test SRCS brpc_serde_test.cc - DEPS ${brpc_test_depends} SERIAL) + cc_test(brpc_serde_test SRCS brpc_serde_test.cc + DEPS ${brpc_test_depends} SERIAL) +endif() diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc new file mode 100644 index 0000000000000000000000000000000000000000..b7ba938cf100db12eb03b7adc56079957c9dd6f4 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -0,0 +1,204 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +constexpr int64_t kNoPadding = -1; + +inline size_t GetSectionIndex(int64_t id, + const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (id < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + +inline std::vector ToAbsoluteSection( + const std::vector& height_sections) { + std::vector abs_sections; + abs_sections.resize(height_sections.size()); + abs_sections[0] = 0; + for (size_t i = 1; i < height_sections.size(); ++i) { + abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; + } + return abs_sections; +} + +inline std::vector> SplitIds( + const std::string& id_name, const std::vector& height_section, + framework::Scope* scope) { + auto& id_tensor = scope->Var(id_name)->Get(); + auto* id_data = id_tensor.data(); + std::set all_ids; + for (size_t i = 0; i < id_tensor.numel(); ++i) { + all_ids.insert(id_data[i]); + } + auto abs_sections = ToAbsoluteSection(height_section); + std::vector> splited_ids; + splited_ids.resize(height_section.size() + 1); + for (auto& id : all_ids) { + auto section_index = GetSectionIndex(id, abs_sections); + splited_ids[section_index].push_back(id - abs_sections[section_index]); + } + return splited_ids; +} + +inline void SplitIdsIntoMultipleVarsBySection( + const std::string& id_name, const std::vector& in_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + framework::Scope* scope) { + PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, ""); + + auto place = platform::CPUPlace(); + + for (size_t i = 0; i < in_var_names.size(); ++i) { + auto* id_tensor = + scope->Var(in_var_names[i])->GetMutable(); + auto& ids = splited_ids[i]; + if (!ids.empty()) { + auto* id_tensor_data = id_tensor->mutable_data( + framework::make_ddim({static_cast(ids.size()), 1}), place); + memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); + } + } +} + +inline void MergeMultipleVarsIntoOnBySection( + const std::string& id_name, const std::string& out_name, + const std::vector& out_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + const framework::ExecutionContext& context, framework::Scope* scope) { + PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size() + 1, ""); + + auto cpu_place = platform::CPUPlace(); + + auto abs_sections = ToAbsoluteSection(height_section); + auto& id_tensor = scope->Var(id_name)->Get(); + auto* id_data = id_tensor.data(); + std::unordered_map> id_to_offset; + for (size_t i = 0; i < id_tensor.numel(); ++i) { + id_to_offset[id_data[i]].push_back(i); + } + + auto* out_tensor = scope->Var(out_name)->GetMutable(); + auto* out_tensor_data = out_tensor->mutable_data(context.GetPlace()); + + for (size_t section_idx = 0; section_idx < out_var_names.size(); + ++section_idx) { + auto& ids_in_this_section = splited_ids[section_idx]; + auto& prefetch_out_var = + scope->Var(out_var_names[section_idx])->Get(); + const auto* out_var_data = prefetch_out_var.data(); + auto& dims = prefetch_out_var.dims(); + + PADDLE_ENFORCE_EQ(dims.size(), 2, ""); + PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]); + + auto row_numel = dims[1]; + + for (size_t i = 0; i < dims[0]; ++i) { + auto id = ids_in_this_section[i]; + auto origin_id = id + abs_sections[section_idx]; + auto& offsets = id_to_offset[origin_id]; + for (auto& offset : offsets) { + // should support GPU tensor + memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place, + out_var_data + i * row_numel, sizeof(float) * row_numel); + } + } + } +} + +void prefetch(const std::string& id_name, const std::string& out_name, + const std::string& table_name, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context) { + auto& local_scope = context.scope().NewScope(); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(context.GetPlace()); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + context.Attr("trainer_id")); + + std::vector in_var_names; + std::vector out_var_names; + for (size_t i = 0; i < epmap.size(); ++i) { + in_var_names.push_back(id_name + "@" + epmap[i]); + out_var_names.push_back(out_name + "@" + epmap[i]); + } + + auto splited_ids = SplitIds(id_name, height_sections, &local_scope); + SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections, + splited_ids, &local_scope); + + // create output var in local scope + for (auto& name : out_var_names) { + local_scope.Var(name)->GetMutable(); + } + + std::vector rets; + for (size_t i = 0; i < in_var_names.size(); i++) { + if (NeedSend(local_scope, in_var_names[i])) { + VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i] + << " to get " << out_var_names[i] << " back"; + rets.push_back(rpc_client->AsyncPrefetchVar( + epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i])); + } else { + VLOG(30) << "don't send no-initialied variable: " << out_var_names[i]; + } + } + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + + MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, + height_sections, splited_ids, context, + &local_scope); + + context.scope().DeleteScope(&local_scope); +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 03336367df09aaa96fcbbce4f4493424f7bd432e..9e680ec20bf382177d93bf7f08632d6af9fafe50 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -14,195 +14,20 @@ #pragma once -#include #include #include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace operators { namespace distributed { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -constexpr int64_t kNoPadding = -1; - -inline size_t GetSectionIndex(int64_t id, - const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (id < abs_sections[i]) { - return i - 1; - } - } - return abs_sections.size() - 1; -} - -inline std::vector ToAbsoluteSection( - const std::vector& height_sections) { - std::vector abs_sections; - abs_sections.resize(height_sections.size()); - abs_sections[0] = 0; - for (size_t i = 1; i < height_sections.size(); ++i) { - abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; - } - return abs_sections; -} - -inline std::vector> SplitIds( - const std::string& id_name, const std::vector& height_section, - framework::Scope* scope) { - auto& id_tensor = scope->Var(id_name)->Get(); - auto* id_data = id_tensor.data(); - std::set all_ids; - for (size_t i = 0; i < id_tensor.numel(); ++i) { - all_ids.insert(id_data[i]); - } - auto abs_sections = ToAbsoluteSection(height_section); - std::vector> splited_ids; - splited_ids.resize(height_section.size() + 1); - for (auto& id : all_ids) { - auto section_index = GetSectionIndex(id, abs_sections); - splited_ids[section_index].push_back(id - abs_sections[section_index]); - } - return splited_ids; -} - -inline void SplitIdsIntoMultipleVarsBySection( - const std::string& id_name, const std::vector& in_var_names, - const std::vector& height_section, - const std::vector>& splited_ids, - framework::Scope* scope) { - PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, ""); - - auto place = platform::CPUPlace(); - - for (size_t i = 0; i < in_var_names.size(); ++i) { - auto* id_tensor = - scope->Var(in_var_names[i])->GetMutable(); - auto& ids = splited_ids[i]; - if (!ids.empty()) { - auto* id_tensor_data = id_tensor->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); - } - } -} - -inline void MergeMultipleVarsIntoOnBySection( - const std::string& id_name, const std::string& out_name, - const std::vector& out_var_names, - const std::vector& height_section, - const std::vector>& splited_ids, - const framework::ExecutionContext& context, framework::Scope* scope) { - PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size() + 1, ""); - - auto cpu_place = platform::CPUPlace(); - - auto abs_sections = ToAbsoluteSection(height_section); - auto& id_tensor = scope->Var(id_name)->Get(); - auto* id_data = id_tensor.data(); - std::unordered_map> id_to_offset; - for (size_t i = 0; i < id_tensor.numel(); ++i) { - id_to_offset[id_data[i]].push_back(i); - } - - auto* out_tensor = scope->Var(out_name)->GetMutable(); - auto* out_tensor_data = out_tensor->mutable_data(context.GetPlace()); - - for (size_t section_idx = 0; section_idx < out_var_names.size(); - ++section_idx) { - auto& ids_in_this_section = splited_ids[section_idx]; - auto& prefetch_out_var = - scope->Var(out_var_names[section_idx])->Get(); - const auto* out_var_data = prefetch_out_var.data(); - auto& dims = prefetch_out_var.dims(); - - PADDLE_ENFORCE_EQ(dims.size(), 2, ""); - PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]); - - auto row_numel = dims[1]; - - for (size_t i = 0; i < dims[0]; ++i) { - auto id = ids_in_this_section[i]; - auto origin_id = id + abs_sections[section_idx]; - auto& offsets = id_to_offset[origin_id]; - for (auto& offset : offsets) { - // should support GPU tensor - memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place, - out_var_data + i * row_numel, sizeof(float) * row_numel); - } - } - } -} - void prefetch(const std::string& id_name, const std::string& out_name, const std::string& table_name, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context) { - auto& local_scope = context.scope().NewScope(); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(context.GetPlace()); - - distributed::RPCClient* rpc_client = - distributed::RPCClient::GetInstance( - context.Attr("trainer_id")); - - std::vector in_var_names; - std::vector out_var_names; - for (size_t i = 0; i < epmap.size(); ++i) { - in_var_names.push_back(id_name + "@" + epmap[i]); - out_var_names.push_back(out_name + "@" + epmap[i]); - } - - auto splited_ids = SplitIds(id_name, height_sections, &local_scope); - SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections, - splited_ids, &local_scope); - - // create output var in local scope - for (auto& name : out_var_names) { - local_scope.Var(name)->GetMutable(); - } - - std::vector rets; - for (size_t i = 0; i < in_var_names.size(); i++) { - if (NeedSend(local_scope, in_var_names[i])) { - VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i] - << " to get " << out_var_names[i] << " back"; - rets.push_back(rpc_client->AsyncPrefetchVar( - epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i])); - } else { - VLOG(30) << "don't send no-initialied variable: " << out_var_names[i]; - } - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); - } - - MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, - height_sections, splited_ids, context, - &local_scope); - - context.scope().DeleteScope(&local_scope); -} + const framework::ExecutionContext& context); }; // namespace distributed }; // namespace operators diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 1878dfe8a897db1b8c948d325fa48a38ca224a2b..74baa8a350625f566993009417e21c1d7765b7a2 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -87,6 +87,18 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "If the grad op reuse the input's variable.") .SetDefault(false); + + // for parameter prefetch + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({"127.0.0.1:6164"}); + AddComment(R"DOC( Lookup Table Operator. diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index e504c4f0cd5c0feaef4a251fad57b389a10a2ce7..69cae78b7059a7c34fd07bb34883ee34a2010297 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -23,6 +23,8 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" + namespace paddle { namespace operators {