From 133bac2b10de13d11424e52a6bbe935817cde083 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 24 Oct 2018 17:44:02 +0800 Subject: [PATCH] Accelerate embedding op grad test=develop --- paddle/fluid/operators/lookup_table_op.h | 26 ++++++++---------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 58463dc4d6f..eac6224d106 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -68,6 +68,7 @@ class LookupTableKernel : public framework::OpKernel { const auto *table = table_t.value().data(); auto *output = output_t->mutable_data(context.GetPlace()); + auto blas = math::GetBlas(context); for (int64_t i = 0; i < ids_numel; ++i) { if (padding_idx != kNoPadding && ids[i] == padding_idx) { memset(output + i * row_width, 0, row_width * sizeof(T)); @@ -75,8 +76,8 @@ class LookupTableKernel : public framework::OpKernel { PADDLE_ENFORCE_GE(ids[i], 0); auto id_index = table_t.Index(ids[i]); PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); - memcpy(output + i * row_width, table + id_index * row_width, - row_width * sizeof(T)); + blas.VCOPY(row_width, table + id_index * row_width, + output + i * row_width); } } } @@ -111,27 +112,16 @@ class LookupTableGradKernel : public framework::OpKernel { auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); - framework::Vector new_rows; + std::vector new_rows; new_rows.reserve(ids_num); - for (int64_t i = 0; i < ids_num; i++) { - new_rows.push_back(ids_data[i]); - } + std::memcpy(new_rows.data(), ids_data, ids_num * sizeof(int64_t)); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); d_table_value->Resize({ids_num, table_dim[1]}); - d_table_value->mutable_data(context.GetPlace()); - - d_table->set_height(table_dim[0]); - - auto *d_output_data = d_output->data(); - auto *d_table_data = d_table_value->data(); - - auto d_output_dims = d_output->dims(); - PADDLE_ENFORCE_EQ( - d_table_value->dims(), - framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); - memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); + // memory optimization will NOT reuse Tensor with SelectedRows + // so we could just share the tensor here directly. + d_table_value->ShareDataWith(*d_output); } else { auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); -- GitLab