From c3c3c0b33cf9100dd8f90f039ef0f130f53bafef Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 27 Nov 2018 09:24:53 +0000 Subject: [PATCH] polish code, test=develop --- paddle/fluid/framework/mixed_vector.h | 6 -- paddle/fluid/framework/selected_rows.cc | 52 ++++++++++ paddle/fluid/framework/selected_rows.h | 55 ++--------- .../operators/hierarchical_sigmoid_op.cc | 2 +- .../fluid/operators/hierarchical_sigmoid_op.h | 79 ++++++++-------- .../fluid/operators/math/matrix_bit_code.cc | 62 ++++++------ paddle/fluid/operators/math/matrix_bit_code.h | 94 +++++++++---------- python/paddle/fluid/layers/nn.py | 2 +- .../fluid/tests/unittests/test_hsigmoid_op.py | 6 +- 9 files changed, 176 insertions(+), 182 deletions(-) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 21118c4fc9..6940250c3f 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -488,12 +488,6 @@ class CPUVector : public std::vector> { return os; } - size_t size() const noexcept { - size_t size = - static_cast(std::vector>::size()); - return size; - } - T &operator[](size_t id) { return this->at(id); } const T &operator[](size_t id) const { return this->at(id); } diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 7262f8cc05..f4f2b769d5 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -140,6 +140,58 @@ bool SelectedRows::HasKey(int64_t key) const { : true; } +int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown, + bool is_test) { + if (is_test) { + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + return -1; + } else { + return iter->second; + } + } + + rwlock_->RDLock(); + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + rwlock_->UNLock(); + if (!auto_grown) { + PADDLE_THROW("key %d not found", key); + } + rwlock_->WRLock(); + auto map_size = id_to_index_.size(); + auto vector_size = rows_.size(); + if (map_size != vector_size) { + rwlock_->UNLock(); + PADDLE_THROW( + "id_to_index_ size %d should have the same size with rows_ %d", + map_size, vector_size); + } + auto write_iter = id_to_index_.find(key); + if (write_iter == id_to_index_.end()) { + int row_num = rows_.size(); + if (row_num == value_->dims()[0]) { + rwlock_->UNLock(); + PADDLE_THROW("selected rows is full, then length exceed %d", row_num); + } + // key logic to put a key into id_to_index_ + rows_.push_back(key); + auto index = static_cast(rows_.size() - 1); + id_to_index_[key] = index; + rwlock_->UNLock(); + return index; + } else { + auto index = write_iter->second; + rwlock_->UNLock(); + return index; + } + } else { + auto index = iter->second; + rwlock_->UNLock(); + return index; + } +} + void SelectedRows::SyncIndex() { rwlock_->WRLock(); id_to_index_.clear(); diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index bc5726382f..44384082db 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -118,54 +118,17 @@ class SelectedRows { * * @return index of the key. 
*/ - inline int64_t AutoGrownIndex(int64_t key, bool auto_grown, - bool is_test = false) { - if (is_test) { - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - return -1; - } else { - return iter->second; - } - } - rwlock_->RDLock(); + int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false); + + /* + * @brief Get the index of the key from id_to_index_ map. + */ + inline int64_t GetIndexFromId(int64_t key) { auto iter = id_to_index_.find(key); if (iter == id_to_index_.end()) { - rwlock_->UNLock(); - if (!auto_grown) { - PADDLE_THROW("key %d not found", key); - } - rwlock_->WRLock(); - auto map_size = id_to_index_.size(); - auto vector_size = rows_.size(); - if (map_size != vector_size) { - rwlock_->UNLock(); - PADDLE_THROW( - "id_to_index_ size %d should have the same size with rows_ %d", - map_size, vector_size); - } - auto write_iter = id_to_index_.find(key); - if (write_iter == id_to_index_.end()) { - int row_num = rows_.size(); - if (row_num == value_->dims()[0]) { - rwlock_->UNLock(); - PADDLE_THROW("selected rows is full, then length exceed %d", row_num); - } - // key logic to put a key into id_to_index_ - rows_.push_back(key); - auto index = static_cast(rows_.size() - 1); - id_to_index_[key] = index; - rwlock_->UNLock(); - return index; - } else { - auto index = write_iter->second; - rwlock_->UNLock(); - return index; - } + return -1; } else { - auto index = iter->second; - rwlock_->UNLock(); - return index; + return iter->second; } } @@ -185,7 +148,7 @@ class SelectedRows { // SelectedRows add a Tensor, will the duplicate rows be handled. Vector rows_; std::unordered_map - id_to_index_; // should not be used when ids has duplicate member + id_to_index_; // should not be used when rows_ has duplicate member std::unique_ptr value_{nullptr}; int64_t height_; // height indicates the underline tensor's height std::unique_ptr rwlock_{nullptr}; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index f3329c4855..5b09958e73 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -101,7 +101,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { "it should have shape like [N, L], L is the length of the Path") .AsDispensable(); AddInput( - "PCode", + "PathCode", "(LoDTensor, optional), The Code on each Node of the Path from root " "to current word" "it should have shape like [N, L], L is the length of the Path") diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index de219bacdd..6cb011611d 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -19,9 +19,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/platform/transform.h" + namespace paddle { namespace operators { @@ -30,31 +32,26 @@ template ; using platform::Transform; -std::vector cal_rows(const framework::LoDTensor& path) { - std::set tmp; - std::vector rows; - for (size_t i = 0; i < static_cast(path.dims()[0]); i++) { - for (size_t j = 0; j < static_cast(path.dims()[1]); j++) { - int64_t temp = - path.data()[i * static_cast(path.dims()[1]) + j]; - if (temp >= 0) { - tmp.insert(temp); - } +static std::vector PathToRows(const framework::LoDTensor& path) { + std::set rows; + for (int64_t i = 0; i < path.numel(); ++i) { + int64_t row = path.data()[i]; + if (row < 0) { + continue; } + rows.emplace(row); } - rows.assign(tmp.begin(), tmp.end()); - return rows; + return std::vector(rows.begin(), rows.end()); } - template class HierarchicalSigmoidOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* w = ctx.Input("W"); + auto in = detail::Ref(ctx.Input("X")); + auto w = detail::Ref(ctx.Input("W")); auto* path = ctx.Input("PTable"); - auto* code = ctx.Input("PCode"); - auto* label = ctx.Input("Label"); + auto* code = ctx.Input("PathCode"); + auto label = detail::Ref(ctx.Input("Label")); auto* bias = ctx.Input("Bias"); auto* out = ctx.Output("Out"); auto* pre_out = ctx.Output("PreOut"); @@ -65,7 +62,7 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { } int64_t code_length = path ? 
path->dims()[1] : math::FindLastSet(num_classes - 1); - int64_t batch_size = in->dims()[0]; + int64_t batch_size = in.dims()[0]; framework::LoDTensor sum; auto& dev_ctx = ctx.template device_context(); auto* pre_out_data = pre_out->mutable_data( @@ -81,10 +78,10 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { std::unique_ptr> bit_code; if (!is_custom) { bit_code.reset(new math::MatrixBitCodeFunctor(num_classes, - label->data())); + label.data())); } else { - bit_code.reset(new math::MatrixBitCodeFunctor(path, code, - label->data())); + bit_code.reset(new math::MatrixBitCodeFunctor(*path, *code, + label.data())); } std::vector sum_dims({batch_size, 1UL}); @@ -95,7 +92,7 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { if (bias) { bit_code->Add(*bias, pre_out); } - bit_code->Mul(pre_out, *w, *in); + bit_code->Mul(pre_out, w, in); // clip to [-40, 40] Transform trans; trans(ctx.template device_context(), pre_out_data, @@ -117,23 +114,23 @@ template class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* w = ctx.Input("W"); + auto in = detail::Ref(ctx.Input("X")); + auto w = detail::Ref(ctx.Input("W")); auto* path = ctx.Input("PTable"); - auto* code = ctx.Input("PCode"); + auto* code = ctx.Input("PathCode"); auto* bias = ctx.Input("Bias"); auto* in_grad = ctx.Output(framework::GradVarName("X")); bool is_sparse = ctx.Attr("is_sparse"); auto& dev_ctx = ctx.template device_context(); math::SetConstant zero; - auto* label = ctx.Input("Label"); - auto* pre_out = ctx.Input("PreOut"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); + auto label = detail::Ref(ctx.Input("Label")); + auto pre_out = detail::Ref(ctx.Input("PreOut")); + auto out_grad = detail::Ref( + ctx.Input(framework::GradVarName("Out"))); framework::LoDTensor pre_out_grad; - pre_out_grad.mutable_data(pre_out->dims(), ctx.GetPlace()); + pre_out_grad.mutable_data(pre_out.dims(), ctx.GetPlace()); in_grad->mutable_data(ctx.GetPlace()); zero(dev_ctx, in_grad, static_cast(0.0)); @@ -147,16 +144,16 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { std::unique_ptr> bit_code; if (!is_custom) { bit_code.reset(new math::MatrixBitCodeFunctor(num_classes, - label->data())); + label.data())); } else { - bit_code.reset(new math::MatrixBitCodeFunctor(path, code, - label->data())); + bit_code.reset(new math::MatrixBitCodeFunctor(*path, *code, + label.data())); } auto& place = *ctx.template device_context().eigen_device(); - auto pre_out_mat = EigenMatrix::From(*pre_out); + auto pre_out_mat = EigenMatrix::From(pre_out); auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); - auto out_grad_mat = EigenMatrix::From(*out_grad); + auto out_grad_mat = EigenMatrix::From(out_grad); Eigen::array bcast{1, static_cast(pre_out_grad.dims()[1])}; @@ -181,17 +178,17 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("W")); w_grad->mutable_data(ctx.GetPlace()); zero(dev_ctx, w_grad, static_cast(0.0)); - bit_code->MulGradWeight(pre_out_grad, w_grad, *in); + bit_code->MulGradWeight(pre_out_grad, w_grad, in); } else { - framework::Vector real_rows = cal_rows(*path); + framework::Vector real_rows = PathToRows(*path); auto* w_grad = ctx.Output(framework::GradVarName("W")); w_grad->set_rows(real_rows); // Build a map of id -> row_index to speed up finding the index of one id w_grad->SyncIndex(); - 
w_grad->set_height(w->dims()[0]); + w_grad->set_height(w.dims()[0]); auto* w_grad_value = w_grad->mutable_value(); - framework::DDim temp_dim(w->dims()); + framework::DDim temp_dim(w.dims()); set(temp_dim, 0, real_rows.size()); w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); @@ -211,9 +208,9 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { zero(dev_ctx, bias_grad_value, static_cast(0.0)); bit_code->AddGrad(pre_out_grad, bias_grad); } - bit_code->MulGradWeight(pre_out_grad, w_grad, *in); + bit_code->MulGradWeight(pre_out_grad, w_grad, in); } - bit_code->MulGradError(pre_out_grad, *w, in_grad); + bit_code->MulGradError(pre_out_grad, w, in_grad); } }; diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 297e8d850b..71b9293eed 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -19,12 +19,12 @@ namespace operators { namespace math { template -void MatrixBitCodeFunctor::Add(const framework::LoDTensor& vec, - framework::LoDTensor* tmat) { +void MatrixBitCodeFunctor::Add(const framework::Tensor& vec, + framework::Tensor* tmat) { size_t batch_size = tmat->dims()[0]; size_t width = tmat->dims()[1]; for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table->get_code(i); + auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); @@ -34,12 +34,12 @@ void MatrixBitCodeFunctor::Add(const framework::LoDTensor& vec, } template -void MatrixBitCodeFunctor::AddGrad(const framework::LoDTensor& tmat, - framework::LoDTensor* vec) { +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, + framework::Tensor* vec) { size_t batch_size = tmat.dims()[0]; size_t width = tmat.dims()[1]; for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table->get_code(i); + auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); @@ -49,17 +49,16 @@ void MatrixBitCodeFunctor::AddGrad(const framework::LoDTensor& tmat, } template -void MatrixBitCodeFunctor::AddGrad(const framework::LoDTensor& tmat, +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec) { size_t batch_size = tmat.dims()[0]; size_t width = tmat.dims()[1]; for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table->get_code(i); + auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); - int64_t row_index = - vec->AutoGrownIndex(static_cast(index), false, true); + int64_t row_index = vec->GetIndexFromId(static_cast(index)); vec->mutable_value()->data()[row_index] += tmat.data()[i * width + j]; } @@ -67,13 +66,13 @@ void MatrixBitCodeFunctor::AddGrad(const framework::LoDTensor& tmat, } template -void MatrixBitCodeFunctor::Sum(const framework::LoDTensor& tmat, - framework::LoDTensor* sum, T scale_sum) { +void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, + framework::Tensor* sum, T scale_sum) { size_t num_samples = tmat.dims()[0]; size_t o_width = tmat.dims()[1]; for (size_t i = 0; i < num_samples; ++i) { T sm = static_cast(0.0); - auto code = code_table->get_code(i); + auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { if (code->calc_bit(j)) { @@ -87,9 +86,9 @@ void 
MatrixBitCodeFunctor::Sum(const framework::LoDTensor& tmat, } template -void MatrixBitCodeFunctor::Mul(framework::LoDTensor* tmat, - const framework::LoDTensor& weight, - const framework::LoDTensor& input) { +void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, + const framework::Tensor& weight, + const framework::Tensor& input) { size_t num_samples = tmat->dims()[0]; size_t tmat_width = tmat->dims()[1]; size_t input_width = input.dims()[1]; @@ -98,7 +97,7 @@ void MatrixBitCodeFunctor::Mul(framework::LoDTensor* tmat, auto weight_value = weight.data(); auto input_value = input.data(); for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table->get_code(i); + auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); @@ -113,9 +112,9 @@ void MatrixBitCodeFunctor::Mul(framework::LoDTensor* tmat, } template -void MatrixBitCodeFunctor::MulGradWeight(const framework::LoDTensor& tmat, - framework::LoDTensor* weight, - const framework::LoDTensor& input) { +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, + framework::Tensor* weight, + const framework::Tensor& input) { size_t num_samples = tmat.dims()[0]; size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; @@ -124,7 +123,7 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::LoDTensor& tmat, auto weight_value = weight->data(); auto input_value = input.data(); for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table->get_code(i); + auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); @@ -138,9 +137,9 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::LoDTensor& tmat, } template -void MatrixBitCodeFunctor::MulGradWeight(const framework::LoDTensor& tmat, +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, framework::SelectedRows* weight, - const framework::LoDTensor& input) { + const framework::Tensor& input) { size_t num_samples = tmat.dims()[0]; size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; @@ -149,13 +148,12 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::LoDTensor& tmat, auto weight_value = weight->mutable_value()->data(); auto input_value = input.data(); for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table->get_code(i); + auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); for (size_t k = 0; k < input_width; ++k) { - int64_t row_index = - weight->AutoGrownIndex(static_cast(index), false, true); + int64_t row_index = weight->GetIndexFromId(static_cast(index)); weight_value[row_index * weight_width + k] += tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; } @@ -164,9 +162,9 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::LoDTensor& tmat, } template -void MatrixBitCodeFunctor::MulGradError(const framework::LoDTensor& tmat, - const framework::LoDTensor& weight, - framework::LoDTensor* input) { +void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, + const framework::Tensor& weight, + framework::Tensor* input) { size_t num_samples = tmat.dims()[0]; size_t tmat_width = tmat.dims()[1]; size_t input_width = input->dims()[1]; @@ -176,7 +174,7 @@ void MatrixBitCodeFunctor::MulGradError(const framework::LoDTensor& tmat, auto input_value = input->data(); for (size_t 
i = 0; i < num_samples; ++i) { - auto code = code_table->get_code(i); + auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); @@ -191,11 +189,11 @@ void MatrixBitCodeFunctor::MulGradError(const framework::LoDTensor& tmat, } template -void MatrixBitCodeFunctor::Sub(framework::LoDTensor* tmat) { +void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { size_t num_samples = tmat->dims()[0]; size_t o_width = tmat->dims()[1]; for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table->get_code(i); + auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { if (code->calc_bit(j)) { diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 3add06cb63..c30bb52641 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -132,13 +132,15 @@ class SimpleCode : public Code { size_t c_; }; -template +template class CustomCode : public Code { public: - CustomCode(const framework::LoDTensor* ptable, - const framework::LoDTensor* pcode, const int64_t* ids, - const int index) - : ptable_(ptable), pcode_(pcode), ids_(ids), index_(index) {} + CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, + const int64_t* ids, int index) + : ids_(ids), index_(index) { + ptable_ = ptable.Slice(index, index + 1); + pcode_ = pcode.Slice(index, index + 1); + } /** * Here the id of root shoud be 1 rather than 0, thus the encoding of class c * is `c + num_classes` and all siblings can get the same weight indice using @@ -148,20 +150,13 @@ class CustomCode : public Code { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. 
*/ - size_t calc_index(int bit) const { - return ptable_ - ->data()[index_ * static_cast(ptable_->dims()[1]) + bit]; - } - bool calc_bit(int bit) const { - return pcode_ - ->data()[index_ * static_cast(ptable_->dims()[1]) + bit]; - } + size_t calc_index(int bit) const { return ptable_.data()[bit]; } + bool calc_bit(int bit) const { return pcode_.data()[bit]; } int get_length() const { int length = 0; - for (int i = 0; i < static_cast(ptable_->dims()[1]); i++) { - if (ptable_->data()[index_ * static_cast(ptable_->dims()[1]) + - i] >= 0) { + for (int i = 0; i < static_cast(ptable_.dims()[1]); i++) { + if (ptable_.data()[i] >= 0) { length++; } else { return length; @@ -171,15 +166,15 @@ class CustomCode : public Code { } private: - const framework::LoDTensor* ptable_; - const framework::LoDTensor* pcode_; + framework::Tensor ptable_; + framework::Tensor pcode_; const int64_t* ids_; const int index_; }; class SimpleCodeTable : public CodeTable { public: - explicit SimpleCodeTable(size_t num_classes, const int64_t* ids) + SimpleCodeTable(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids) {} std::unique_ptr get_code(int64_t code) const { std::unique_ptr coder(new SimpleCode(code, num_classes_, ids_)); @@ -193,97 +188,92 @@ class SimpleCodeTable : public CodeTable { const int64_t* ids_; }; -template +template class CustomCodeTable : public CodeTable { public: - explicit CustomCodeTable(const framework::LoDTensor* ptable, - const framework::LoDTensor* pcode, - const int64_t* ids) + CustomCodeTable(const framework::Tensor& ptable, + const framework::Tensor& pcode, const int64_t* ids) : ptable_(ptable), pcode_(pcode), ids_(ids) {} std::unique_ptr get_code(int64_t code) const { - std::unique_ptr coder(new CustomCode(ptable_, pcode_, ids_, code)); + std::unique_ptr coder(new CustomCode(ptable_, pcode_, ids_, code)); return coder; } - size_t size() const { return static_cast(ptable_->dims()[1]); } + size_t size() const { return static_cast(ptable_.dims()[1]); } int get_max_code_length() const { - return static_cast(ptable_->dims()[1]); + return static_cast(ptable_.dims()[1]); } private: - const framework::LoDTensor* ptable_; - const framework::LoDTensor* pcode_; + const framework::Tensor& ptable_; + const framework::Tensor& pcode_; const int64_t* ids_; }; template class MatrixBitCodeFunctor { public: - explicit MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) + MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids), - code_table(new SimpleCodeTable(num_classes, ids)) {} + code_table_(new SimpleCodeTable(num_classes, ids)) {} - explicit MatrixBitCodeFunctor(const framework::LoDTensor* ptable, - const framework::LoDTensor* pcode, - const int64_t* ids) - : num_classes_(static_cast(ptable->dims()[1])), + MatrixBitCodeFunctor(const framework::Tensor& ptable, + const framework::Tensor& pcode, const int64_t* ids) + : num_classes_(static_cast(ptable.dims()[1])), ids_(ids), - code_table(new CustomCodeTable(ptable, pcode, ids)) {} + code_table_(new CustomCodeTable(ptable, pcode, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ - void Add(const framework::LoDTensor& vec, framework::LoDTensor* tmat); + void Add(const framework::Tensor& vec, framework::Tensor* tmat); /* For j < code_length vec(0, index(i, j)) += tmat(i, j) */ - void AddGrad(const framework::LoDTensor& tmat, framework::LoDTensor* vec); + void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); /* For selected rows For j < code_length 
vec(0, index(i, j)) += tmat(i, j) */ - void AddGrad(const framework::LoDTensor& tmat, framework::SelectedRows* vec); + void AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec); /* For j < code_length sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) */ - void Sum(const framework::LoDTensor& tmat, framework::LoDTensor* sum, - T scale_sum); + void Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum); /* For j < code_length tmat(i, j) -= bit(i, j) */ - void Sub(framework::LoDTensor* tmat); + void Sub(framework::Tensor* tmat); /* For j < code_length input.row(i) += tmat(i, j) * weight.row(index(i, j)) */ - void Mul(framework::LoDTensor* tmat, const framework::LoDTensor& weight, - const framework::LoDTensor& input); + void Mul(framework::Tensor* tmat, const framework::Tensor& weight, + const framework::Tensor& input); /* For index(i, j) >= 0: weight.row(index(i, j)) += tmat(i, j) * input.row(i) */ - void MulGradWeight(const framework::LoDTensor& tmat, - framework::LoDTensor* weight, - const framework::LoDTensor& input); + void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight, + const framework::Tensor& input); /* For SelectedRows Weight, For index(i, j) >= 0: weight.row(index(i, j)) += tmat(i, j) * input.row(i) */ - void MulGradWeight(const framework::LoDTensor& tmat, + void MulGradWeight(const framework::Tensor& tmat, framework::SelectedRows* weight, - const framework::LoDTensor& input); + const framework::Tensor& input); /* For j < code_length input.row(i) += tmat(i, j) * weight.row(index(i, j)) */ - void MulGradError(const framework::LoDTensor& tmat, - const framework::LoDTensor& weight, - framework::LoDTensor* input); + void MulGradError(const framework::Tensor& tmat, + const framework::Tensor& weight, framework::Tensor* input); size_t num_classes_; const int64_t* ids_; - std::unique_ptr code_table; + std::unique_ptr code_table_; }; } // namespace math } // namespace operators diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8e7cff8056..fd02b445e7 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4639,7 +4639,7 @@ def hsigmoid(input, "X": input, "W": weights, "PTable": ptable, - "PCode": pcode, + "PathCode": pcode, "Label": label } if helper.bias_attr: diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 955fc51d57..8152ce9b78 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -185,7 +185,7 @@ class TestHSigmoidOpSparse(OpTest): 'X': x, 'W': w, 'PTable': ptable, - 'PCode': pcode, + 'PathCode': pcode, 'Label': label, 'Bias': bias } @@ -285,7 +285,7 @@ class TestHSigmoidOpWithCostumTree(OpTest): 'X': x, 'W': w, 'PTable': ptable, - 'PCode': pcode, + 'PathCode': pcode, 'Label': label, 'Bias': bias } @@ -322,7 +322,7 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest): 'X': x, 'W': w, 'PTable': ptable, - 'PCode': pcode, + 'PathCode': pcode, 'Label': label, } pre_output, out = hsigmoidWithCustomTree( -- GitLab
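
The SelectedRows change above splits the old header-only AutoGrownIndex into two paths: AutoGrownIndex (now defined in selected_rows.cc) still grows rows_ under the read/write lock, while the new GetIndexFromId is a plain, const lookup in id_to_index_ that returns -1 for unknown ids and is intended for phases where the table is no longer growing; the MatrixBitCodeFunctor gradient kernels now call it instead of AutoGrownIndex(key, false, true). The standalone sketch below mirrors that split as an illustration only: the class name RowIndexMap is made up, and a std::mutex stands in for Paddle's RWLock, so this is not the Paddle implementation.

// Simplified sketch (not the Paddle class): an id -> row-index table with
// the same split as the patch. GetIndexFromId() is a const lookup for
// read-only phases; AutoGrownIndex() appends a new row under a lock when
// auto_grown is set.
#include <cstdint>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <unordered_map>
#include <vector>

class RowIndexMap {  // hypothetical name, for illustration only
 public:
  // Read-only lookup: returns -1 when the id has not been registered.
  int64_t GetIndexFromId(int64_t key) const {
    auto iter = id_to_index_.find(key);
    return iter == id_to_index_.end() ? -1 : iter->second;
  }

  // Growing lookup: registers the id on first use when auto_grown is true.
  int64_t AutoGrownIndex(int64_t key, bool auto_grown) {
    std::lock_guard<std::mutex> guard(mu_);
    auto iter = id_to_index_.find(key);
    if (iter != id_to_index_.end()) return iter->second;
    if (!auto_grown) throw std::out_of_range("key not found");
    rows_.push_back(key);
    int64_t index = static_cast<int64_t>(rows_.size()) - 1;
    id_to_index_[key] = index;
    return index;
  }

 private:
  std::mutex mu_;
  std::vector<int64_t> rows_;
  std::unordered_map<int64_t, int64_t> id_to_index_;
};

int main() {
  RowIndexMap table;
  std::cout << table.AutoGrownIndex(42, /*auto_grown=*/true) << "\n";  // 0
  std::cout << table.GetIndexFromId(42) << "\n";                       // 0
  std::cout << table.GetIndexFromId(7) << "\n";                        // -1
}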
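
PathToRows, which replaces cal_rows in hierarchical_sigmoid_op.h, walks the custom path table once and keeps every distinct non-negative id, so the sparse W@GRAD SelectedRows only stores the weight rows the batch actually visited; a std::set keeps the result unique and sorted. A minimal sketch of the same idea over a flat int64_t buffer (standing in for the LoDTensor data, with -1 as the padding value used by the tests) is:

// Sketch of the PathToRows idea on a plain buffer: collect the distinct
// non-negative ids, in sorted order.
#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

std::vector<int64_t> PathToRows(const std::vector<int64_t>& path) {
  std::set<int64_t> rows;  // de-duplicates and sorts the visited row ids
  for (int64_t id : path) {
    if (id < 0) continue;  // negative entries pad unused path slots
    rows.insert(id);
  }
  return {rows.begin(), rows.end()};
}

int main() {
  // A [2 x 4] path table flattened row-major; -1 marks padding.
  std::vector<int64_t> path = {3, 1, -1, -1, 3, 4, 2, -1};
  for (int64_t row : PathToRows(path)) std::cout << row << " ";  // 1 2 3 4
  std::cout << "\n";
}

Keeping the rows de-duplicated matters because w_grad->set_rows(real_rows) followed by SyncIndex() builds the id -> row_index map that GetIndexFromId later consults inside MulGradWeight and AddGrad.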