Merge pull request #14051 from velconia/accelerate_embedding_grad

[1.1] Accelerate sparse embedding grad op in CPU device

Merge pull request #14051 from velconia/accelerate_embedding_grad
[1.1] Accelerate sparse embedding grad op in CPU device
209f24a2 · Qiyang Min · GitHub · 9da9b192 · a8b17537 · 209f24a2
3 changed file
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -81,6 +81,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                     "Otherwise the given value indicates padding the output "
                     "with zeros whenever lookup encounters it in Ids.")
        .SetDefault(kNoPadding);
+    // NOTE(minqiyang): grad_inplace is an temporal attribute,
+    // please do NOT set this attribute in python layer.
+    AddAttr<bool>("grad_inplace",
+                  "(boolean, default false) "
+                  "If the grad op reuse the input's variable.")
+        .SetDefault(false);
    AddComment(R"DOC(
 Lookup Table Operator.


--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/math/blas.h"

 namespace paddle {
 namespace operators {
@@ -68,6 +69,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
      const auto *table = table_t.value().data<T>();
      auto *output = output_t->mutable_data<T>(context.GetPlace());

+      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
      for (int64_t i = 0; i < ids_numel; ++i) {
        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
          memset(output + i * row_width, 0, row_width * sizeof(T));
@@ -75,8 +77,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
          PADDLE_ENFORCE_GE(ids[i], 0);
          auto id_index = table_t.Index(ids[i]);
          PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
-          memcpy(output + i * row_width, table + id_index * row_width,
-                 row_width * sizeof(T));
+          blas.VCOPY(row_width, table + id_index * row_width,
+                     output + i * row_width);
        }
      }
    }
@@ -111,27 +113,37 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
      auto *ids_data = ids->data<int64_t>();
      int64_t ids_num = ids->numel();

-      framework::Vector<int64_t> new_rows;
-      new_rows.reserve(ids_num);
-      for (int64_t i = 0; i < ids_num; i++) {
-        new_rows.push_back(ids_data[i]);
-      }
+      std::vector<int64_t> new_rows;
+      new_rows.resize(ids_num);
+      std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
      d_table->set_rows(new_rows);

      auto *d_table_value = d_table->mutable_value();
      d_table_value->Resize({ids_num, table_dim[1]});
-      d_table_value->mutable_data<T>(context.GetPlace());
-
-      d_table->set_height(table_dim[0]);
-
-      auto *d_output_data = d_output->data<T>();
-      auto *d_table_data = d_table_value->data<T>();
-
-      auto d_output_dims = d_output->dims();
-      PADDLE_ENFORCE_EQ(
-          d_table_value->dims(),
-          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
-      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+      // FIXME(minqiyang):
+      // memory optimization will NOT reuse Tensor with SelectedRows
+      // so we could just share the tensor here directly.
+      // However, the InferVarType method will infer the output SelectedRows
+      // to Tensor sometimes, which is a bug, so we will add an attribute
+      // here to indicate the inplace and remove this attribute after
+      // the InferVarType's bug was fixed
+      bool grad_inplace = context.Attr<bool>("grad_inplace");
+      if (grad_inplace) {
+        d_table_value->ShareDataWith(*d_output);
+      } else {
+        d_table_value->mutable_data<T>(context.GetPlace());
+
+        d_table->set_height(table_dim[0]);
+
+        auto *d_output_data = d_output->data<T>();
+        auto *d_table_data = d_table_value->data<T>();
+
+        auto d_output_dims = d_output->dims();
+        PADDLE_ENFORCE_EQ(
+            d_table_value->dims(),
+            framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
+        memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+      }
    } else {
      auto *ids = context.Input<LoDTensor>("Ids");
      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));

--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -1159,6 +1159,7 @@ def prepare_encoder(src_word,
            name=pos_enc_param_name,
            trainable=False,
            initializer=fluid.initializer.ConstantInitializer(0.001)))
+    src_pos_enc.stop_gradient = True
    enc_input = src_word_emb + src_pos_enc
    return layers.dropout(
        enc_input,