未验证 提交 209f24a2 编写于 作者: Q Qiyang Min 提交者: GitHub

Merge pull request #14051 from velconia/accelerate_embedding_grad

[1.1] Accelerate sparse embedding grad op in CPU device
...@@ -81,6 +81,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -81,6 +81,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
"Otherwise the given value indicates padding the output " "Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids.") "with zeros whenever lookup encounters it in Ids.")
.SetDefault(kNoPadding); .SetDefault(kNoPadding);
// NOTE(minqiyang): grad_inplace is a temporary attribute;
// please do NOT set this attribute in the Python layer.
AddAttr<bool>("grad_inplace",
"(boolean, default false) "
"If the grad op reuse the input's variable.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Lookup Table Operator. Lookup Table Operator.
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -68,6 +69,7 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -68,6 +69,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
const auto *table = table_t.value().data<T>(); const auto *table = table_t.value().data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace()); auto *output = output_t->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx != kNoPadding && ids[i] == padding_idx) { if (padding_idx != kNoPadding && ids[i] == padding_idx) {
memset(output + i * row_width, 0, row_width * sizeof(T)); memset(output + i * row_width, 0, row_width * sizeof(T));
...@@ -75,8 +77,8 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -75,8 +77,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_GE(ids[i], 0); PADDLE_ENFORCE_GE(ids[i], 0);
auto id_index = table_t.Index(ids[i]); auto id_index = table_t.Index(ids[i]);
PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
memcpy(output + i * row_width, table + id_index * row_width, blas.VCOPY(row_width, table + id_index * row_width,
row_width * sizeof(T)); output + i * row_width);
} }
} }
} }
...@@ -111,27 +113,37 @@ class LookupTableGradKernel : public framework::OpKernel<T> { ...@@ -111,27 +113,37 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *ids_data = ids->data<int64_t>(); auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel(); int64_t ids_num = ids->numel();
framework::Vector<int64_t> new_rows; std::vector<int64_t> new_rows;
new_rows.reserve(ids_num); new_rows.resize(ids_num);
for (int64_t i = 0; i < ids_num; i++) { std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
new_rows.push_back(ids_data[i]);
}
d_table->set_rows(new_rows); d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value(); auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_num, table_dim[1]}); d_table_value->Resize({ids_num, table_dim[1]});
d_table_value->mutable_data<T>(context.GetPlace()); // FIXME(minqiyang):
// memory optimization will NOT reuse Tensor with SelectedRows
d_table->set_height(table_dim[0]); // so we could just share the tensor here directly.
// However, the InferVarType method will infer the output SelectedRows
auto *d_output_data = d_output->data<T>(); // to Tensor sometimes, which is a bug, so we will add an attribute
auto *d_table_data = d_table_value->data<T>(); // here to indicate the inplace and remove this attribute after
// the InferVarType bug is fixed.
auto d_output_dims = d_output->dims(); bool grad_inplace = context.Attr<bool>("grad_inplace");
PADDLE_ENFORCE_EQ( if (grad_inplace) {
d_table_value->dims(), d_table_value->ShareDataWith(*d_output);
framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); } else {
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); d_table_value->mutable_data<T>(context.GetPlace());
d_table->set_height(table_dim[0]);
auto *d_output_data = d_output->data<T>();
auto *d_table_data = d_table_value->data<T>();
auto d_output_dims = d_output->dims();
PADDLE_ENFORCE_EQ(
d_table_value->dims(),
framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
}
} else { } else {
auto *ids = context.Input<LoDTensor>("Ids"); auto *ids = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out")); auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
......
...@@ -1159,6 +1159,7 @@ def prepare_encoder(src_word, ...@@ -1159,6 +1159,7 @@ def prepare_encoder(src_word,
name=pos_enc_param_name, name=pos_enc_param_name,
trainable=False, trainable=False,
initializer=fluid.initializer.ConstantInitializer(0.001))) initializer=fluid.initializer.ConstantInitializer(0.001)))
src_pos_enc.stop_gradient = True
enc_input = src_word_emb + src_pos_enc enc_input = src_word_emb + src_pos_enc
return layers.dropout( return layers.dropout(
enc_input, enc_input,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册