add lookup remote table op

60a4f69b · Qiao Longfei · e0b48f7e · 60a4f69b · 60a4f69b
2 changed file
--- a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc
+++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h"
+#include "paddle/fluid/framework/var_type_inference.h"
+namespace paddle {
+namespace operators {
+class LookupRemoteTableOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("W"),
+                   "Input(W) of LookupRemoteTableOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Ids"),
+                   "Input(Ids) of LookupRemoteTableOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LookupRemoteTableOp should not be null.");
+    auto table_dims = ctx->GetInputDim("W");
+    auto ids_dims = ctx->GetInputDim("Ids");
+    int ids_rank = ids_dims.size();
+    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
+    PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
+                      "The last dimension of the 'Ids' tensor must be 1.");
+    auto output_dims =
+        framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
+    output_dims.push_back(table_dims[1]);
+    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
+    if (ctx->GetOutputsVarType("Out")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
+      ctx->ShareLoD("Ids", /*->*/ "Out");
+    }
+  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+class LookupRemoteTableOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("W",
+             "(Tensor) The input represents embedding tensors, "
+             "which is a learnable parameter.");
+    AddInput("Ids",
+             "An input with type int32 or int64 "
+             "contains the ids to be looked up in W. "
+             "The last dimension size must be 1.");
+    AddOutput("Out", "The lookup results, which have the same type as W.");
+    AddAttr<int64_t>("padding_idx",
+                     "(int64, default -1) "
+                     "If the value is -1, it makes no effect to lookup. "
+                     "Otherwise the given value indicates padding the output "
+                     "with zeros whenever lookup encounters it in Ids.")
+        .SetDefault(kNoPadding);
+    // NOTE(minqiyang): grad_inplace is an temporal attribute,
+    // please do NOT set this attribute in python layer.
+    AddAttr<bool>("grad_inplace",
+                  "(boolean, default false) "
+                  "If the grad op reuse the input's variable.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+Lookup Remote Table Operator.
+This operator is used to perform lookups on the parameter W,
+then concatenated into a dense tensor.
+The input Ids can carry the LoD (Level of Details) information,
+or not. And the output only shares the LoD information with input Ids.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(lookup_remote_table, ops::LookupRemoteTableOp,
+                  ops::EmptyGradOpMaker, ops::LookupRemoteTableOpMaker);
+REGISTER_OP_CPU_KERNEL(lookup_remote_table, ops::LookupRemoteTableKernel<float>,
+                       ops::LookupRemoteTableKernel<double>);
--- a/paddle/fluid/operators/distributed_ops/lookup_remote_table.h
+++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table.h
@@ -14,21 +14,22 @@ limitations under the License. */
 #include <future>  // NOLINT
 #include <ostream>
-#include <vector>
 #include <set>
 #include <unordered_map>
+#include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
 namespace paddle {
 namespace operators {
 namespace distributed {
-inline size_t GetSectionIndex(int64_t id, const std::vector<int64_t>& abs_sections) {
+inline size_t GetSectionIndex(int64_t id,
+                              const std::vector<int64_t>& abs_sections) {
  for (size_t i = 1; i < abs_sections.size(); ++i) {
    if (row < abs_sections[i]) {
      return i - 1;
@@ -49,8 +50,7 @@ inline std::vector<int64_t> ToAbsoluteSection(
 }
 inline std::vector<std::vector<int64_t>> SplitIds(
-        const std::string& id_name,
+    const std::string& id_name, const std::vector<int64_t>& height_section,
-        const std::vector<int64_t>& height_section,
    framework::Scope* scope) {
  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
  auto* id_data = id_tensor.data<int64_t>();
@@ -68,8 +68,7 @@ inline std::vector<std::vector<int64_t>> SplitIds(
 }
 inline void SplitIdsIntoMultipleVarsBySection(
-        const std::string& id_name,
+    const std::string& id_name, const std::vector<std::string>& in_var_names,
-        const std::vector<std::string>& in_var_names,
    const std::vector<int64_t>& height_section,
    const std::vector<std::vector<int64_t>>& splited_ids,
    framework::Scope* scope) {
@@ -78,18 +77,19 @@ inline void SplitIdsIntoMultipleVarsBySection(
  auto place = platform::CPUPlace();
  for (size_t i = 0; i < in_var_names.size(); ++i) {
-    auto* id_tensor = scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
+    auto* id_tensor =
+        scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
    auto& ids = splited_ids[i];
    if (!ids.empty()) {
-      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(framework::make_ddim({ids.size(), 1}), place);
+      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(
+          framework::make_ddim({ids.size(), 1}), place);
      memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
    }
  }
 }
 inline void MergeMultipleVarsIntoOnBySection(
-        const std::string& id_name,
+    const std::string& id_name, const std::string& out_name,
-        const std::string& out_name,
    const std::vector<std::string>& out_var_names,
    const std::vector<int64_t>& height_section,
    const std::vector<std::vector<int64_t>>& splited_ids,
@@ -109,9 +109,11 @@ inline void MergeMultipleVarsIntoOnBySection(
  auto& out_tensor = scope->Var(out_name)->Get<framework::LoDTensor>();
  auto* out_tensor_data = out_tensor.mutable_data<float>();
-  for (size_t section_idx = 0; section_idx < out_var_names.size(); ++section_idx) {
+  for (size_t section_idx = 0; section_idx < out_var_names.size();
+       ++section_idx) {
    auto& ids_in_this_section = splited_ids[section_idx];
-    auto& prefetch_out_var = scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
+    auto& prefetch_out_var =
+        scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
    const auto* out_var_data = prefetch_out_var.mutable_data<float>();
    auto& dims = prefetch_out_var.dims();
@@ -126,31 +128,27 @@ inline void MergeMultipleVarsIntoOnBySection(
      auto& offsets = id_to_offset[origin_id];
      for (auto& offset : offsets) {
        // should support GPU tensor
-        memory::Copy(cpu_place, out_tensor_data + offset * row_numel,
+        memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place,
-                     cpu_place, out_var_data + i * grad_row_numel,
+                     out_var_data + i * grad_row_numel,
                     sizeof(T) * grad_row_numel);
      }
    }
  }
 }
-inline void prefetch(
+inline void prefetch(const std::string& table_name, const std::string& id_name,
-        const std::string& table_name,
-        const std::string& id_name,
                     const std::string& out_name,
                     const std::vector<std::string>& epmap,
                     const std::vector<int64_t>& height_section,
                     const framework::Scope& scope,
                     const platform::Place& place) const {
  auto local_scope = scope.NewScope();
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto& ctx = *pool.Get(place);
  distributed::RPCClient* rpc_client =
-          distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(Attr<int>("trainer_id"));
-                  Attr<int>("trainer_id"));
  std::vector<std::string> in_var_names;
  std::vector<std::string> out_var_names;
@@ -160,7 +158,8 @@ inline void prefetch(
  }
  auto splited_ids = SplitIds(id_name, height_section, local_scope);
-  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section, splited_ids, local_scope);
+  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section,
+                                    splited_ids, local_scope);
  // create output var in local scope
  for (auto& name : out_var_names) {
@@ -172,8 +171,8 @@ inline void prefetch(
    if (NeedSend(local_scope, ins[i])) {
      VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get "
               << outs[i] << " back";
-      rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, local_scope,
+      rets.push_back(rpc_client->AsyncPrefetchVar(
-                                                  in_var_names[i], out_var_names[i]));
+          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
    } else {
      VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
    }
@@ -182,11 +181,71 @@ inline void prefetch(
    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
  }
-  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, height_section, plited_ids, scope)
+  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
+                                   height_section, plited_ids, scope)
      scope.DeleteScope(local_scope);
 }
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+constexpr int64_t kNoPadding = -1;
+template <typename T>
+class LookupRemoteTableKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* ids_t = context.Input<LoDTensor>("Ids");      // int tensor
+    auto* output_t = context.Output<LoDTensor>("Out");  // float tensor
+    auto* table_var = context.InputVar("W");
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+    int64_t* ids = const_cast<int64_t*>(ids_t->data<int64_t>());
+    int64_t ids_numel = ids_t->numel();
+    if (table_var->IsType<LoDTensor>()) {
+      auto* table_t = context.Input<LoDTensor>("W");
+      int64_t row_number = table_t->dims()[0];
+      int64_t row_width = table_t->dims()[1];
+      auto* table = table_t->data<T>();
+      auto* output = output_t->mutable_data<T>(context.GetPlace());
+      for (int64_t i = 0; i < ids_numel; ++i) {
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_LT(ids[i], row_number);
+          PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
+          memcpy(output + i * row_width, table + ids[i] * row_width,
+                 row_width * sizeof(T));
+        }
+      }
+    } else if (table_var->IsType<SelectedRows>()) {
+      const auto& table_t = table_var->Get<SelectedRows>();
+      int64_t row_width = table_t.value().dims()[1];
+      const auto* table = table_t.value().data<T>();
+      auto* output = output_t->mutable_data<T>(context.GetPlace());
+      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+      for (int64_t i = 0; i < ids_numel; ++i) {
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_GE(ids[i], 0);
+          auto id_index = table_t.Index(ids[i]);
+          PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
+          blas.VCOPY(row_width, table + id_index * row_width,
+                     output + i * row_width);
+        }
+      }
+    }
+  }
+};
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle