From 1abd3b3a29b6964323d679d47dea31830f5b5e6a Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Mon, 27 Nov 2017 19:28:57 +0800
Subject: [PATCH] implement forward

---
 paddle/operators/CMakeLists.txt             |  4 +-
 paddle/operators/hierarchical_sigmoid_op.cc |  6 +-
 paddle/operators/hierarchical_sigmoid_op.h  | 39 ++++++++++-
 paddle/operators/math/math_function.cc      |  2 +
 paddle/operators/math/math_function.cu      |  2 +
 paddle/operators/math/math_function.h       |  6 ++
 paddle/operators/math/math_function_impl.h  | 14 ++++
 paddle/operators/math/matrix_bit_code.cc    | 77 +++++++++++++++++++--
 paddle/operators/math/matrix_bit_code.h     | 19 ++++-
 9 files changed, 157 insertions(+), 12 deletions(-)
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index a719da2560..93ec763424 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -185,7 +185,8 @@ set(DEPS_OPS
     tensor_array_read_write_op
     gru_op
     adagrad_op
-    sgd_op)
+    sgd_op
+    hierarchical_sigmoid_op)
 
 
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
@@ -203,6 +204,7 @@ op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
 op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
 op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
 op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
+op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
 if(WITH_GPU)
 op_library(nccl_op DEPS nccl_common)
 endif()
diff --git a/paddle/operators/hierarchical_sigmoid_op.cc b/paddle/operators/hierarchical_sigmoid_op.cc
index 9b7af92662..f81f3d34d1 100644
--- a/paddle/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/operators/hierarchical_sigmoid_op.cc
@@ -85,12 +85,16 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
              "(TensorArray, required) The input array. Each Tensor has the "
              "same shape with [N * D].")
         .AsDuplicable();
+    AddInput("Parameters",
+             "(Tensor, required), The parameters of hierarchical "
+             "sigmoid operator, each of them is a 2-D tensor.")
+        .AsDuplicable();
     AddInput("Label",
-             "(Tensor, required), The labels of training data. It's a"
+             "(Tensor, required), The labels of training data. It's a "
              "1-D tensor.");
     AddInput("Bias",
              "(Tensor, optional), The bias is a 1-D tensor, "
-             "which is applied to the output");
+             "which is applied to the output.");
     AddOutput(
         "Out",
         "(Tensor, required) The output of hierarchical sigmoid operator.");
diff --git a/paddle/operators/hierarchical_sigmoid_op.h b/paddle/operators/hierarchical_sigmoid_op.h
index 11a553a403..baf655f214 100644
--- a/paddle/operators/hierarchical_sigmoid_op.h
+++ b/paddle/operators/hierarchical_sigmoid_op.h
@@ -14,28 +14,61 @@ limitations under the License. */
 
 #pragma once
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/matrix_bit_code.h"
 
 namespace paddle {
 namespace operators {
-template <typename Place, typename T>
 
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+// Forward: Out = rowsum(softrelu(pre_out)) - bit-selected sum of pre_out.
+template <typename Place, typename T>
 class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto params = ctx.MultiInput<framework::Tensor>("Parameters");
     auto* label = ctx.Input<framework::Tensor>("Label");
     auto* bias = ctx.Input<framework::Tensor>("Bias");
+    auto* out = ctx.Output<framework::Tensor>("Out");
     size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
+    framework::Tensor sum;
+    framework::Tensor pre_out;
+    auto place = ctx.GetEigenDevice<Place>();
+    auto& device_ctx = ctx.device_context();
+    // NOTE(review): pre_out width probably should be the max code length.
     int64_t batch_size = ins[0]->dims()[0];
     int64_t size = ins.size();
-    framework::Tensor pre_out;
     std::vector<int64_t> pre_out_dims({batch_size, size});
     pre_out.mutable_data<T>(framework::make_ddim(pre_out_dims), ctx.GetPlace());
+    // View after allocation; zero it since the helpers below accumulate (+=).
+    auto pre_out_mat = EigenMatrix<T>::From(pre_out);
+    pre_out_mat.device(place) = pre_out_mat.constant(static_cast<T>(0));
+    std::vector<int64_t> sum_dims({batch_size, 1});
+    sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
+    out->mutable_data<T>(ctx.GetPlace());
 
-    if (bias != NULL) {
+    if (bias) {
       math::AddByBitCode<T>(num_classes, *label, pre_out, *bias);
     }
+    for (size_t i = 0; i < ins.size(); ++i) {
+      math::MulByBitCode<T>(num_classes, *label, pre_out, *params[i], *ins[i]);
+    }
+    // clip pre_out element-wise to [-40, 40]
+    pre_out_mat.device(place) = pre_out_mat.cwiseMax(static_cast<T>(-40.0))
+                                    .cwiseMin(static_cast<T>(40.0));
+    // out(i) = -\sum_j bit(i, j) * pre_out(i, j); argument order is (tmat, sum)
+    math::SumByBitCode<T>(num_classes, *label, pre_out, *out,
+                          static_cast<T>(-1));
+    // softrelu
+    pre_out_mat.device(place) = (static_cast<T>(1) + pre_out_mat.exp()).log();
+    // out += rowsum(pre_out); the row sums must be added to Out, not discarded
+    math::RowwiseSum<Place, T>()(device_ctx, pre_out, &sum);
+    auto out_vec = framework::EigenVector<T>::Flatten(*out);
+    out_vec.device(place) = out_vec + framework::EigenVector<T>::Flatten(sum);
   }
 };
 
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index 2e333a8cde..3bc0945fe3 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -314,6 +314,8 @@ template struct RowwiseAdd<platform::CPUPlace, float>;
 template struct RowwiseAdd<platform::CPUPlace, double>;
 template struct ColwiseSum<platform::CPUPlace, float>;
 template struct ColwiseSum<platform::CPUPlace, double>;
+template struct RowwiseSum<platform::CPUPlace, float>;
+template struct RowwiseSum<platform::CPUPlace, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 58356a4b77..1a226821f7 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -298,6 +298,8 @@ template struct RowwiseAdd<platform::GPUPlace, float>;
 template struct RowwiseAdd<platform::GPUPlace, double>;
 template struct ColwiseSum<platform::GPUPlace, float>;
 template struct ColwiseSum<platform::GPUPlace, double>;
+template struct RowwiseSum<platform::GPUPlace, float>;
+template struct RowwiseSum<platform::GPUPlace, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index ffb99f5380..c21a20fc32 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -130,6 +130,12 @@ struct ColwiseSum {
                   const framework::Tensor& input, framework::Tensor* vec);
 };
 
+template <typename Place, typename T>
+struct RowwiseSum {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* vec);
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h
index 4dc17a4e52..8c1971fc61 100644
--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
@@ -78,6 +78,20 @@ void ColwiseSum<Place, T>::operator()(const platform::DeviceContext& context,
       in.sum(Eigen::array<int, 1>({{0}})).reshape(shape);
 }
 
+// Sums each row of the 2-D `input`; `vector` must hold one element per row.
+template <typename Place, typename T>
+void RowwiseSum<Place, T>::operator()(const platform::DeviceContext& context,
+                                      const framework::Tensor& input,
+                                      framework::Tensor* vector) {
+  auto size = input.numel() / input.dims()[1];  // number of rows
+  PADDLE_ENFORCE_EQ(vector->numel(), size);
+  auto in = framework::EigenMatrix<T>::From(input);
+  auto vec = framework::EigenMatrix<T>::From(*vector);
+  Eigen::array<int, 2> shape({{static_cast<int>(size), 1}});
+  // Reduce along axis 1 (columns); axis 0 would produce column sums instead.
+  vec.reshape(shape).device(*context.GetEigenDevice<Place>()) =
+      in.sum(Eigen::array<int, 1>({{1}})).reshape(shape);
+}
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/matrix_bit_code.cc b/paddle/operators/math/matrix_bit_code.cc
index 30c2ffc2cf..8f68e2f79d 100644
--- a/paddle/operators/math/matrix_bit_code.cc
+++ b/paddle/operators/math/matrix_bit_code.cc
@@ -53,18 +53,18 @@ namespace math {
 template <class CodeTable, class Op, typename T>
 static void AddByBitCodeT(Op op, CodeTable code_table,
                           const framework::Tensor& codes, framework::Tensor& a,
-                          framework::Tensor& b) {
+                          const framework::Tensor& b) {
   size_t num_classes = code_table.size();
   size_t max_code_length = code_table.get_max_code_length();
-  size_t num_sample = a.dims()[0].size();
-  size_t width = a.dims()[1].size();
+  size_t num_sample = a.dims()[0];
+  size_t width = a.dims()[1];
 
   for (size_t i = 0; i < num_sample; ++i) {
-    auto code = code_table(codes.data<T>()[i]) int code_length =
-        code.get_length();
+    auto code = code_table(codes.data<T>()[i]);
+    int code_length = code.get_length();
     for (int j = 0; j < code_length; + j) {
       size_t index = code.calc_index(j);
-      op(a<T>.data()[i * width + j], b<T>.data()[index]);
+      op(a.data<T>()[i * width + j], b.data<T>()[index]);
     }
   }
 }
@@ -79,6 +79,71 @@ void AddByBitCode(size_t num_classes, const framework::Tensor& codes,
   AddByBitCodeT<T>(op, SimpleCodeTable(num_classes), codes, a, b);
 }
 
+// sum(i) = scale_sum * \sum_j bit(i, j) * tmat(i, j), for j < codeLength
+template <class CodeTable, typename T>
+void SumByBitCodeT(CodeTable code_table, const framework::Tensor& codes,
+                   framework::Tensor& tmat, framework::Tensor& sum,
+                   const T& scale_sum) {
+  size_t num_samples = tmat.dims()[0];
+  size_t o_width = tmat.dims()[1];
+  for (size_t i = 0; i < num_samples; ++i) {
+    T sm = 0;
+    auto code = code_table(codes.data<T>()[i]);
+    int code_length = code.get_length();
+    for (int j = 0; j < code_length; ++j) {
+      if (code.calc_bit(j)) {
+        sm += tmat.data<T>()[i * o_width + j];
+      }
+    }
+    sum.data<T>()[i] = scale_sum * sm;
+  }
+}
+/* For j < codeLength:
+    sum(i, 0) = scale_sum * \sum_j bit(i, j) * tmat(i, j)
+*/
+template <typename T>
+void SumByBitCode(size_t num_classes, const framework::Tensor& codes,
+                  framework::Tensor& tmat, framework::Tensor& sum,
+                  T scale_sum) {
+  SumByBitCodeT(SimpleCodeTable(num_classes), codes, tmat, sum, scale_sum);
+}
+
+// j < codeLength: op(tmat(i, j), weight.row(index(i, j)), input.row(i), dim)
+template <typename T, class CodeTable, class Op>
+void MulByBitCodeT(Op op, CodeTable code_table, const framework::Tensor& codes,
+                   framework::Tensor& tmat, const framework::Tensor& weight,
+                   const framework::Tensor& input) {
+  size_t num_samples = tmat.dims()[0];
+  size_t input_dim = input.dims()[1];
+  size_t o_width = tmat.dims()[1];
+
+  for (size_t i = 0; i < num_samples; ++i) {
+    auto code = code_table(codes.data<T>()[i]);
+    int code_length = code.get_length();
+    for (int j = 0; j < code_length; ++j) {
+      size_t index = code.calc_index(j);
+      op(tmat.data<T>()[i * o_width + j],
+         weight.data<T>() + index * weight.dims()[1],
+         input.data<T>() + i * input.dims()[1], input_dim);
+    }
+  }
+}
+
+template <typename T>
+void MulByBitCode(size_t num_classes, const framework::Tensor& codes,
+                  framework::Tensor& tmat, const framework::Tensor& weight,
+                  const framework::Tensor& input) {
+  auto op = [](T& t, const T* weight_row, const T* input_row,
+               size_t input_dim) {
+    T sum = 0;
+    for (size_t k = 0; k < input_dim; ++k) {
+      sum += weight_row[k] * input_row[k];
+    }
+    t += sum;
+  };
+  MulByBitCodeT<T>(op, SimpleCodeTable(num_classes), codes, tmat, weight,
+                   input);
+}
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/matrix_bit_code.h b/paddle/operators/math/matrix_bit_code.h
index bb0599aa17..7bef5077b9 100644
--- a/paddle/operators/math/matrix_bit_code.h
+++ b/paddle/operators/math/matrix_bit_code.h
@@ -59,10 +59,27 @@ struct SimpleCodeTable {
   int max_code_length_;
 };
 
+/* For j < codeLength
+    tmat(i, j) += vec(0, index(i, j))
+*/
 template <typename T>
 void AddByBitCode(size_t num_classes, const framework::Tensor& codes,
-                  framework::Tensor& a, const framework::Tensor& b);
+                  framework::Tensor& tmat, const framework::Tensor& vec);
 
+/* For j < codeLength
+    sum(i, 0) = scale_sum * \sum_j bit(i, j) * tmat(i, j)
+*/
+template <typename T>
+void SumByBitCode(size_t num_classes, const framework::Tensor& codes,
+                  framework::Tensor& tmat, framework::Tensor& sum, T scale_sum);
+
+/* For j < codeLength
+    tmat(i, j) += dot(weight.row(index(i, j)), input.row(i))
+*/
+template <typename T>
+void MulByBitCode(size_t num_classes, const framework::Tensor& codes,
+                  framework::Tensor& tmat, const framework::Tensor& weight,
+                  const framework::Tensor& input);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
-- 
GitLab