add parameter prefetch

4ad5fd8f · Qiao Longfei · 9d276fe8 · 4ad5fd8f · 4ad5fd8f · 4ad5fd8f
5 changed files
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -9,36 +9,37 @@ else()
 endif()
 configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
+set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")  # applied to sources in both the gRPC and bRPC branches below
 if(WITH_GRPC)
  grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
        request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
      PROTO send_recv.proto 
      DEPS lod_tensor selected_rows memory)
-  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
  cc_test(grpc_serde_test SRCS grpc_serde_test.cc 
    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
  cc_test(rpc_server_test SRCS rpc_server_test.cc
    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
  cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
-  return()
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc)  # WITH_GRPC on: link against the gRPC transport library
-endif()
+else()
+  set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
+      brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
+  brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
-    brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+      brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
+    PROTO send_recv.proto
+    DEPS lod_tensor selected_rows memory)
-brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc 
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc)  # WITH_GRPC off: link against the bRPC transport library
-    brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
-  PROTO send_recv.proto
-  DEPS lod_tensor selected_rows memory)
-set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
+  set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
-cc_test(brpc_server_test SRCS rpc_server_test.cc 
+  cc_test(brpc_server_test SRCS rpc_server_test.cc
-    DEPS ${brpc_test_depends} SERIAL)
+      DEPS ${brpc_test_depends} SERIAL)
-cc_test(brpc_serde_test SRCS brpc_serde_test.cc 
+  cc_test(brpc_serde_test SRCS brpc_serde_test.cc
-    DEPS ${brpc_test_depends} SERIAL)
+      DEPS ${brpc_test_depends} SERIAL)
+endif()
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <cstring>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
+#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
+namespace paddle {
+namespace operators {
+namespace distributed {
+// Local shorthands for framework types.
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+// NOTE(review): not referenced by any function visible in this file;
+// presumably a "no padding index" sentinel -- confirm before relying on it.
+constexpr int64_t kNoPadding = -1;
+// Returns the index of the section that `id` belongs to.
+//
+// `abs_sections` holds the first row of every section (see
+// ToAbsoluteSection). Boundaries 1..n-1 are scanned in order and the section
+// just before the first boundary greater than `id` is returned; an id that is
+// not below any boundary is assigned to the last section. An empty
+// `abs_sections` yields 0.
+inline size_t GetSectionIndex(int64_t id,
+                              const std::vector<int64_t>& abs_sections) {
+  // Guard: with no sections `size() - 1` below would wrap around to SIZE_MAX
+  // and callers would index far out of bounds with it.
+  if (abs_sections.empty()) {
+    return 0;
+  }
+  for (size_t i = 1; i < abs_sections.size(); ++i) {
+    if (id < abs_sections[i]) {
+      return i - 1;
+    }
+  }
+  return abs_sections.size() - 1;
+}
+// Converts per-section heights into absolute start rows.
+//
+// Element i of the result is the sum of height_sections[0..i-1], i.e. the
+// first row of section i; the last height is not added to anything. An empty
+// input describes a single unsplit section and yields {0}, so that callers
+// can always index section 0.
+inline std::vector<int64_t> ToAbsoluteSection(
+    const std::vector<int64_t>& height_sections) {
+  // Guard: writing element 0 of a zero-length vector would be undefined
+  // behaviour.
+  if (height_sections.empty()) {
+    return std::vector<int64_t>(1, 0);
+  }
+  std::vector<int64_t> abs_sections(height_sections.size());
+  abs_sections[0] = 0;
+  for (size_t i = 1; i < height_sections.size(); ++i) {
+    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
+  }
+  return abs_sections;
+}
+// Groups the distinct ids stored in variable `id_name` by table section.
+//
+// The id variable must hold an int64 LoDTensor. The result has
+// height_section.size() + 1 buckets; bucket s lists, in ascending order and
+// without duplicates, the ids that fall into section s, each expressed
+// relative to the first row of that section.
+inline std::vector<std::vector<int64_t>> SplitIds(
+    const std::string& id_name, const std::vector<int64_t>& height_section,
+    framework::Scope* scope) {
+  // The id variable already exists (possibly in an ancestor of `scope`), so
+  // look it up with FindVar(); Var() would create a new, empty variable in
+  // `scope` itself.
+  auto* id_var = scope->FindVar(id_name);
+  PADDLE_ENFORCE(id_var != nullptr, "Cannot find id variable %s", id_name);
+  auto& id_tensor = id_var->Get<framework::LoDTensor>();
+  auto* id_data = id_tensor.data<int64_t>();
+  // std::set both removes duplicates and keeps the ids sorted.
+  std::set<int64_t> all_ids(id_data, id_data + id_tensor.numel());
+  auto abs_sections = ToAbsoluteSection(height_section);
+  std::vector<std::vector<int64_t>> splited_ids;
+  splited_ids.resize(height_section.size() + 1);
+  for (auto& id : all_ids) {
+    auto section_index = GetSectionIndex(id, abs_sections);
+    // Store the id as an offset inside its own section.
+    splited_ids[section_index].push_back(id - abs_sections[section_index]);
+  }
+  return splited_ids;
+}
+// Stores every per-section id list in its own LoDTensor variable.
+//
+// Variable in_var_names[s] is created in `scope` for every section s. When
+// splited_ids[s] is non-empty the tensor is allocated on the CPU with shape
+// [n, 1] (int64) and filled with those ids; otherwise it is left untouched.
+// `id_name` is not used.
+inline void SplitIdsIntoMultipleVarsBySection(
+    const std::string& id_name, const std::vector<std::string>& in_var_names,
+    const std::vector<int64_t>& height_section,
+    const std::vector<std::vector<int64_t>>& splited_ids,
+    framework::Scope* scope) {
+  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
+  auto cpu = platform::CPUPlace();
+  size_t section = 0;
+  for (const auto& var_name : in_var_names) {
+    // The variable is created even when there is nothing to copy into it.
+    auto* tensor = scope->Var(var_name)->GetMutable<framework::LoDTensor>();
+    auto& section_ids = splited_ids[section];
+    ++section;
+    if (section_ids.empty()) {
+      continue;
+    }
+    auto* dst = tensor->mutable_data<int64_t>(
+        framework::make_ddim({static_cast<int64_t>(section_ids.size()), 1}),
+        cpu);
+    memcpy(dst, section_ids.data(), sizeof(int64_t) * section_ids.size());
+  }
+}
+// Scatters the rows fetched for every section back into the output tensor.
+//
+// For each section s the tensor in variable out_var_names[s] must be 2-D and
+// hold one row per id of splited_ids[s], in the same order. Each row is
+// copied to every position at which its absolute id occurs in the id tensor
+// `id_name`, so row k of `out_name` receives the row fetched for the k-th id.
+// Sections without ids are skipped.
+// NOTE(review): rows are read and written as float through CPU copies even
+// though the output is allocated on context.GetPlace() -- GPU outputs are not
+// handled yet.
+inline void MergeMultipleVarsIntoOnBySection(
+    const std::string& id_name, const std::string& out_name,
+    const std::vector<std::string>& out_var_names,
+    const std::vector<int64_t>& height_section,
+    const std::vector<std::vector<int64_t>>& splited_ids,
+    const framework::ExecutionContext& context, framework::Scope* scope) {
+  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size() + 1, "");
+  auto cpu_place = platform::CPUPlace();
+  auto abs_sections = ToAbsoluteSection(height_section);
+  // The ids and the merged output already exist outside this function
+  // (possibly in an ancestor of `scope`), so they are looked up with
+  // FindVar(); Var() would create new, empty variables in `scope` itself.
+  auto* id_var = scope->FindVar(id_name);
+  PADDLE_ENFORCE(id_var != nullptr, "Cannot find id variable %s", id_name);
+  auto* out_var = scope->FindVar(out_name);
+  PADDLE_ENFORCE(out_var != nullptr, "Cannot find output variable %s",
+                 out_name);
+  auto& id_tensor = id_var->Get<framework::LoDTensor>();
+  auto* id_data = id_tensor.data<int64_t>();
+  // id -> every position at which that id appears in the id tensor.
+  std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
+  const int64_t id_count = id_tensor.numel();
+  for (int64_t i = 0; i < id_count; ++i) {
+    id_to_offset[id_data[i]].push_back(static_cast<size_t>(i));
+  }
+  auto* out_tensor = out_var->GetMutable<framework::LoDTensor>();
+  auto* out_tensor_data = out_tensor->mutable_data<float>(context.GetPlace());
+  for (size_t section_idx = 0; section_idx < out_var_names.size();
+       ++section_idx) {
+    auto& ids_in_this_section = splited_ids[section_idx];
+    // No ids were routed to this section; prefetch() presumably skipped the
+    // request (the id variable was never allocated), so there is no fetched
+    // tensor to read.
+    if (ids_in_this_section.empty()) {
+      continue;
+    }
+    auto& prefetch_out_var =
+        scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
+    const auto* out_var_data = prefetch_out_var.data<float>();
+    auto& dims = prefetch_out_var.dims();
+    PADDLE_ENFORCE_EQ(dims.size(), 2, "");
+    PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
+    auto row_numel = dims[1];
+    // The row count equals the id count (enforced above), so iterating over
+    // the ids avoids a signed/unsigned comparison against dims[0].
+    for (size_t i = 0; i < ids_in_this_section.size(); ++i) {
+      auto id = ids_in_this_section[i];
+      // Section-relative id back to the id used in the original id tensor.
+      auto origin_id = id + abs_sections[section_idx];
+      auto& offsets = id_to_offset[origin_id];
+      for (auto& offset : offsets) {
+        // should support GPU tensor
+        memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place,
+                     out_var_data + i * row_numel, sizeof(float) * row_numel);
+      }
+    }
+  }
+}
+// Fetches the table rows for the ids in `id_name` from the remote endpoints
+// and writes them to `out_name`.
+//
+// Steps: the ids are split by `height_sections` into one variable per
+// endpoint inside a temporary child scope, every initialized id variable is
+// sent to its endpoint with AsyncPrefetchVar, all requests are awaited, and
+// the per-endpoint results are merged into `out_name`.
+//
+//   id_name          variable holding the ids to look up
+//   out_name         variable receiving the merged rows
+//   table_name       currently unused
+//   epmap            endpoints, one per section
+//   height_sections  section heights used to route ids to endpoints
+//   context          supplies the scope, the place and attribute "trainer_id"
+//
+// Raises a Paddle enforce error when any request fails.
+void prefetch(const std::string& id_name, const std::string& out_name,
+              const std::string& table_name,
+              const std::vector<std::string>& epmap,
+              const std::vector<int64_t>& height_sections,
+              const framework::ExecutionContext& context) {
+  auto& local_scope = context.scope().NewScope();
+  // Delete the temporary scope on every exit path, including the exceptions
+  // raised by the enforce checks below; otherwise it would stay attached to
+  // the parent scope.
+  struct LocalScopeGuard {
+    const framework::Scope& parent;
+    framework::Scope* child;
+    ~LocalScopeGuard() { parent.DeleteScope(child); }
+  } scope_guard{context.scope(), &local_scope};
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(context.GetPlace());
+  distributed::RPCClient* rpc_client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+          context.Attr<int>("trainer_id"));
+  // One input/output variable pair per endpoint, named "<var>@<endpoint>".
+  std::vector<std::string> in_var_names;
+  std::vector<std::string> out_var_names;
+  for (size_t i = 0; i < epmap.size(); ++i) {
+    in_var_names.push_back(id_name + "@" + epmap[i]);
+    out_var_names.push_back(out_name + "@" + epmap[i]);
+  }
+  auto splited_ids = SplitIds(id_name, height_sections, &local_scope);
+  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections,
+                                    splited_ids, &local_scope);
+  // create output var in local scope
+  for (auto& name : out_var_names) {
+    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
+  }
+  std::vector<distributed::VarHandlePtr> rets;
+  for (size_t i = 0; i < in_var_names.size(); i++) {
+    if (NeedSend(local_scope, in_var_names[i])) {
+      VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i]
+               << " to get " << out_var_names[i] << " back";
+      rets.push_back(rpc_client->AsyncPrefetchVar(
+          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
+    } else {
+      VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
+    }
+  }
+  // Wait for every outstanding request before reporting a failure: the
+  // requests were issued against `local_scope`, which is deleted on exit.
+  bool all_succeeded = true;
+  for (auto& ret : rets) {
+    if (!ret->Wait()) {
+      all_succeeded = false;
+    }
+  }
+  PADDLE_ENFORCE(all_succeeded, "internal error in RPCClient");
+  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
+                                   height_sections, splited_ids, context,
+                                   &local_scope);
+}
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
@@ -14,195 +14,20 @@
 #pragma once
-#include <set>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
 namespace paddle {
 namespace operators {
 namespace distributed {
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-using DDim = framework::DDim;
-constexpr int64_t kNoPadding = -1;
-inline size_t GetSectionIndex(int64_t id,
-                              const std::vector<int64_t>& abs_sections) {
-  for (size_t i = 1; i < abs_sections.size(); ++i) {
-    if (id < abs_sections[i]) {
-      return i - 1;
-    }
-  }
-  return abs_sections.size() - 1;
-}
-inline std::vector<int64_t> ToAbsoluteSection(
-    const std::vector<int64_t>& height_sections) {
-  std::vector<int64_t> abs_sections;
-  abs_sections.resize(height_sections.size());
-  abs_sections[0] = 0;
-  for (size_t i = 1; i < height_sections.size(); ++i) {
-    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
-  }
-  return abs_sections;
-}
-inline std::vector<std::vector<int64_t>> SplitIds(
-    const std::string& id_name, const std::vector<int64_t>& height_section,
-    framework::Scope* scope) {
-  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
-  auto* id_data = id_tensor.data<int64_t>();
-  std::set<int64_t> all_ids;
-  for (size_t i = 0; i < id_tensor.numel(); ++i) {
-    all_ids.insert(id_data[i]);
-  }
-  auto abs_sections = ToAbsoluteSection(height_section);
-  std::vector<std::vector<int64_t>> splited_ids;
-  splited_ids.resize(height_section.size() + 1);
-  for (auto& id : all_ids) {
-    auto section_index = GetSectionIndex(id, abs_sections);
-    splited_ids[section_index].push_back(id - abs_sections[section_index]);
-  }
-  return splited_ids;
-}
-inline void SplitIdsIntoMultipleVarsBySection(
-    const std::string& id_name, const std::vector<std::string>& in_var_names,
-    const std::vector<int64_t>& height_section,
-    const std::vector<std::vector<int64_t>>& splited_ids,
-    framework::Scope* scope) {
-  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
-  auto place = platform::CPUPlace();
-  for (size_t i = 0; i < in_var_names.size(); ++i) {
-    auto* id_tensor =
-        scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
-    auto& ids = splited_ids[i];
-    if (!ids.empty()) {
-      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(
-          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
-      memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
-    }
-  }
-}
-inline void MergeMultipleVarsIntoOnBySection(
-    const std::string& id_name, const std::string& out_name,
-    const std::vector<std::string>& out_var_names,
-    const std::vector<int64_t>& height_section,
-    const std::vector<std::vector<int64_t>>& splited_ids,
-    const framework::ExecutionContext& context, framework::Scope* scope) {
-  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size() + 1, "");
-  auto cpu_place = platform::CPUPlace();
-  auto abs_sections = ToAbsoluteSection(height_section);
-  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
-  auto* id_data = id_tensor.data<int64_t>();
-  std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
-  for (size_t i = 0; i < id_tensor.numel(); ++i) {
-    id_to_offset[id_data[i]].push_back(i);
-  }
-  auto* out_tensor = scope->Var(out_name)->GetMutable<framework::LoDTensor>();
-  auto* out_tensor_data = out_tensor->mutable_data<float>(context.GetPlace());
-  for (size_t section_idx = 0; section_idx < out_var_names.size();
-       ++section_idx) {
-    auto& ids_in_this_section = splited_ids[section_idx];
-    auto& prefetch_out_var =
-        scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
-    const auto* out_var_data = prefetch_out_var.data<float>();
-    auto& dims = prefetch_out_var.dims();
-    PADDLE_ENFORCE_EQ(dims.size(), 2, "");
-    PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
-    auto row_numel = dims[1];
-    for (size_t i = 0; i < dims[0]; ++i) {
-      auto id = ids_in_this_section[i];
-      auto origin_id = id + abs_sections[section_idx];
-      auto& offsets = id_to_offset[origin_id];
-      for (auto& offset : offsets) {
-        // should support GPU tensor
-        memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place,
-                     out_var_data + i * row_numel, sizeof(float) * row_numel);
-      }
-    }
-  }
-}
 void prefetch(const std::string& id_name, const std::string& out_name,
               const std::string& table_name,
               const std::vector<std::string>& epmap,
               const std::vector<int64_t>& height_sections,
-              const framework::ExecutionContext& context) {
+              const framework::ExecutionContext& context);  // defined in .cc
-  auto& local_scope = context.scope().NewScope();
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(context.GetPlace());
-  distributed::RPCClient* rpc_client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-          context.Attr<int>("trainer_id"));
-  std::vector<std::string> in_var_names;
-  std::vector<std::string> out_var_names;
-  for (size_t i = 0; i < epmap.size(); ++i) {
-    in_var_names.push_back(id_name + "@" + epmap[i]);
-    out_var_names.push_back(out_name + "@" + epmap[i]);
-  }
-  auto splited_ids = SplitIds(id_name, height_sections, &local_scope);
-  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections,
-                                    splited_ids, &local_scope);
-  // create output var in local scope
-  for (auto& name : out_var_names) {
-    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
-  }
-  std::vector<distributed::VarHandlePtr> rets;
-  for (size_t i = 0; i < in_var_names.size(); i++) {
-    if (NeedSend(local_scope, in_var_names[i])) {
-      VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i]
-               << " to get " << out_var_names[i] << " back";
-      rets.push_back(rpc_client->AsyncPrefetchVar(
-          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
-    } else {
-      VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
-    }
-  }
-  for (size_t i = 0; i < rets.size(); i++) {
-    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
-  }
-  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
-                                   height_sections, splited_ids, context,
-                                   &local_scope);
-  context.scope().DeleteScope(&local_scope);
-}
 };  // namespace distributed
 };  // namespace operators

--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -87,6 +87,18 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(boolean, default false) "
                  "If the grad op reuse the input's variable.")
        .SetDefault(false);
+    // for parameter prefetch: "trainer_id" is read by distributed::prefetch();
+    // "height_sections" and "epmap" mirror its parameters of the same name.
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
+    AddAttr<std::vector<int64_t>>("height_sections",
+                                  "Height for each output SelectedRows.")
+        .SetDefault(std::vector<int64_t>({}));
+    // The trailing space keeps the two adjacent literals from running
+    // together in the generated description.
+    AddAttr<std::vector<std::string>>(
+        "epmap",
+        "(string vector, default 127.0.0.1:6164) "
+        "Server endpoints in the order of input variables for mapping")
+        .SetDefault({"127.0.0.1:6164"});
    AddComment(R"DOC(
 Lookup Table Operator.

--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -23,6 +23,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
 namespace paddle {
 namespace operators {