diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc
index 56fdf542bbb93ec28c0dc21bacf38eedb3968bd0..6a8720c1cc27de41a91b40c29ae9d08b99ccb09e 100644
--- a/paddle/fluid/eager/grad_tensor_holder.cc
+++ b/paddle/fluid/eager/grad_tensor_holder.cc
@@ -78,9 +78,9 @@ void GradTensorHolder::add(size_t slot_id, size_t rank,
       if (buffer_tensor.is_dense_tensor()) {
         paddle::imperative::SelectedRowsAddToTensor(t, &buffer_tensor);
       } else {
-        PADDLE_THROW(paddle::platform::errors::Fatal(
-            "We don't support Selected Rows merge for now, support it later "
-            "and make all kinds of grads can be merged."));
+        buffer_tensor =
+            std::move(*paddle::imperative::SelectedRowsMerge<
+                      paddle::experimental::Tensor>(t, buffer_tensor));
       }
     }
   }
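Note: GradTensorHolder::add previously threw a Fatal error whenever both the incoming gradient and the buffered gradient were SelectedRows. It now merges the two via the templated paddle::imperative::SelectedRowsMerge introduced further down in this patch. A minimal sketch of the resulting fallback path (illustrative only; assumes `t` and `buffer_tensor` both hold pten::SelectedRows):

    #include <utility>
    #include "paddle/fluid/imperative/gradient_accumulator.h"

    // Merge two SelectedRows gradients (rows are combined and values of
    // duplicate rows are summed by MergeAdd), then move the merged
    // result into the accumulation buffer.
    void AccumulateSparseGrad(const paddle::experimental::Tensor& t,
                              paddle::experimental::Tensor* buffer_tensor) {
      *buffer_tensor = std::move(
          *paddle::imperative::SelectedRowsMerge<paddle::experimental::Tensor>(
              t, *buffer_tensor));
    }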
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
index b771ff28d8ee2d762f5bca717942d4a57c155984..734a611d07b57b6e8e31933cf2683e60efff487a 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
@@ -20,6 +20,7 @@
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/grad_tensor_holder.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/selected_rows.h"
 
 #include "paddle/pten/core/kernel_registry.h"
 
@@ -102,3 +103,69 @@ TEST(GradTensorHolder, Interfaces) {
   CHECK_EQ(holder_et0_ptr[0], 1.0f);
   CHECK_EQ(holder_et1_ptr[0], 30.0f);
 }
+
+TEST(GradTensorHolder, SelectedRowsMergeAdd) {
+  pten::CPUPlace cpu;
+
+  std::vector<int64_t> rows{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  int64_t table_size = 10;
+  int64_t embedding_width = 10;
+
+  auto sr1 = std::make_shared<pten::SelectedRows>(rows, table_size);
+  auto sr2 = std::make_shared<pten::SelectedRows>(rows, table_size);
+
+  // initialize sparse table 1: row i is filled with the value i
+  sr1->mutable_value()->Resize(
+      pten::framework::make_ddim({table_size, embedding_width}));
+  auto* data_sr1 = sr1->mutable_value()->mutable_data<float>(cpu);
+  for (int64_t i = 0; i < table_size; ++i) {
+    for (int64_t j = 0; j < embedding_width; ++j) {
+      data_sr1[i * embedding_width + j] = static_cast<float>(i);
+    }
+  }
+
+  // initialize sparse table 2 with the same contents
+  sr2->mutable_value()->Resize(
+      pten::framework::make_ddim({table_size, embedding_width}));
+  auto* data_sr2 = sr2->mutable_value()->mutable_data<float>(cpu);
+  for (int64_t i = 0; i < table_size; ++i) {
+    for (int64_t j = 0; j < embedding_width; ++j) {
+      data_sr2[i * embedding_width + j] = static_cast<float>(i);
+    }
+  }
+  // wrap the two pten::SelectedRows into paddle::experimental::Tensor
+  paddle::experimental::Tensor t1(sr1);
+  paddle::experimental::Tensor t2(sr2);
+
+  // construct an empty GradTensorHolder
+  GradSlotMeta slot_meta;
+  slot_meta.Init(1);
+  GradTensorHolder grad_tensor_holder =
+      GradTensorHolder({slot_meta, slot_meta});
+
+  // accumulation
+  grad_tensor_holder.add(0, 0, t1, false);
+  grad_tensor_holder.add(0, 0, t2, false);
+
+  // Buffers()
+  const auto& buffers = grad_tensor_holder.Buffers();
+  CHECK_EQ(static_cast<int>(buffers.size()), 2);
+  CHECK_EQ(static_cast<int>(buffers[0].size()), 1);
+  CHECK_EQ(static_cast<int>(buffers[1].size()), 1);
+
+  // operator[]
+  const auto& holder_et0 = grad_tensor_holder[0][0];
+
+  auto* tmp_buffer_tensor =
+      static_cast<pten::SelectedRows*>(holder_et0.impl().get());
+  auto* tmp_buffer_data_sr =
+      tmp_buffer_tensor->mutable_value()->mutable_data<float>(cpu);
+
+  // verify the MergeAdd result (accumulation result)
+  for (int64_t i = 0; i < table_size; ++i) {
+    for (int64_t j = 0; j < embedding_width; ++j) {
+      EXPECT_EQ(tmp_buffer_data_sr[i * embedding_width + j],
+                (static_cast<float>(i) + static_cast<float>(i)));
+    }
+  }
+}
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 89d9324039c15cecd8ba1518aae3645e2f540f9d..90cf0e76e000736f730121a6fcce841aa38a59ae 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -44,9 +44,9 @@ if(WITH_GLOO)
 endif()
 
 if(NOT WITH_ASCEND_CL)
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function)
+cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function pten_tensor)
 else()
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner)
+cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner pten_tensor)
 endif()
 
 add_subdirectory(tests)
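The gradient_accumulator.cc changes below turn SelectedRowsMerge from a Variable-only helper into a template over both gradient containers: the legacy dygraph path (framework::Variable in, VariableWrapper out) and the eager path (paddle::experimental::Tensor in and out). The GetInnerTensor/GetEmptyInnerTensor overloads hide the wrapper-specific access to the underlying pten::SelectedRows. A hedged usage sketch of the two supported instantiations (the wrapper function names here are illustrative, not part of the patch):

    #include <memory>
    #include "paddle/fluid/imperative/gradient_accumulator.h"

    // Eager mode: inputs and the merged result are
    // paddle::experimental::Tensor backed by pten::SelectedRows.
    std::shared_ptr<paddle::experimental::Tensor> MergeEager(
        const paddle::experimental::Tensor& t1,
        const paddle::experimental::Tensor& t2) {
      return paddle::imperative::SelectedRowsMerge<
          paddle::experimental::Tensor>(t1, t2);
    }

    // Legacy dygraph: inputs are framework::Variable and the merged
    // result is returned as a VariableWrapper named "Temp".
    std::shared_ptr<paddle::imperative::VariableWrapper> MergeLegacy(
        const paddle::framework::Variable& v1,
        const paddle::framework::Variable& v2) {
      return paddle::imperative::SelectedRowsMerge<
          paddle::imperative::VariableWrapper>(v1, v2);
    }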
dst_var = std::make_shared("Temp"); + pten::SelectedRows* dst_selected_rows = + GetEmptyInnerTensor(dst_var.get()); #define PADDLE_SELECTED_ROWS_ADD(dev_ctx_type, cpp_type) \ if (data_type == framework::DataTypeTrait::DataType()) { \ @@ -515,12 +534,17 @@ std::shared_ptr SelectedRowsMerge( #endif #undef PADDLE_SELECTED_ROWS_ADD - PADDLE_THROW(platform::errors::InvalidArgument( "Not supported data type %s for SelectedRowsMerge", framework::DataTypeToString(data_type))); } +template std::shared_ptr SelectedRowsMerge( + const paddle::experimental::Tensor& src1, + const paddle::experimental::Tensor& src2); +template std::shared_ptr SelectedRowsMerge( + const framework::Variable& src1, const framework::Variable& src2); + void VariableWrapperAdd(std::shared_ptr var, VariableWrapper* dst_var, bool unchange_input) { auto& src = var->Var(); @@ -547,7 +571,7 @@ void VariableWrapperAdd(std::shared_ptr var, *dst = std::move(*(var->MutableVar())); } } else if (src.IsType()) { - auto temp = SelectedRowsMerge(src, *dst); + auto temp = SelectedRowsMerge(src, *dst); *dst = std::move(*(temp->MutableVar())); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -603,7 +627,7 @@ void GradientAccumulator::AccumulateGrad() { SelectedRowsAddToTensor(*dst, src); *dst = std::move(*src); } else if (src->IsType()) { - auto temp = SelectedRowsMerge(*src, *dst); + auto temp = SelectedRowsMerge(*src, *dst); *dst = std::move(*(temp->MutableVar())); } } else { diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 6371f64fe61044d6cc9ea8a10e5dbcacd3d187e4..ee2df582e81ee5cefe1faf9f3700b91c6adae434 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -17,10 +17,10 @@ #include #include #include - #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/pten/api/include/tensor.h" namespace paddle { namespace imperative { @@ -164,6 +164,10 @@ class SortedGradientAccumulator : public GradientAccumulator { std::vector tmp_grad_vars_; }; +template +std::shared_ptr SelectedRowsMerge(const VarType& src1, + const VarType& src2); + template void SelectedRowsAddToTensor(const VarType& src, VarType* dst); diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index 56eb47a2ef1719d3aad9eb10a47a46d06d0866d5..774bb9653e2cba5c27f9037ee905e70175375339 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -12,7 +12,7 @@ else() endif(WIN32) -cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function) +cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function pten_tensor pten_api pten_api_utils) cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 
diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
index 584f8ead3d8de40ed296da9e2f99845b9e7e5d3c..4dfc8198064e376edf55df9b4c51031344f71485 100644
--- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
+++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
@@ -29,6 +29,57 @@ namespace framework = paddle::framework;
 namespace paddle {
 namespace imperative {
 
+TEST(Test__SelectedRowsMerge_Test, SelectedRowsMerge) {
+  pten::CPUPlace cpu;
+
+  std::vector<int64_t> rows{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  int64_t table_size = 10;
+  int64_t embedding_width = 10;
+
+  auto sr1 = std::make_shared<pten::SelectedRows>(rows, table_size);
+  auto sr2 = std::make_shared<pten::SelectedRows>(rows, table_size);
+
+  // initialize sparse table 1: row i is filled with the value i
+  sr1->mutable_value()->Resize(
+      pten::framework::make_ddim({table_size, embedding_width}));
+  auto* data_sr1 = sr1->mutable_value()->mutable_data<float>(cpu);
+  for (int64_t i = 0; i < table_size; ++i) {
+    for (int64_t j = 0; j < embedding_width; ++j) {
+      data_sr1[i * embedding_width + j] = static_cast<float>(i);
+    }
+  }
+
+  // initialize sparse table 2 with the same contents
+  sr2->mutable_value()->Resize(
+      pten::framework::make_ddim({table_size, embedding_width}));
+  auto* data_sr2 = sr2->mutable_value()->mutable_data<float>(cpu);
+  for (int64_t i = 0; i < table_size; ++i) {
+    for (int64_t j = 0; j < embedding_width; ++j) {
+      data_sr2[i * embedding_width + j] = static_cast<float>(i);
+    }
+  }
+  // wrap the two pten::SelectedRows into paddle::experimental::Tensor
+  paddle::experimental::Tensor t1(sr1);
+  paddle::experimental::Tensor t2(sr2);
+
+  // call SelectedRowsMerge
+  auto new_buffer =
+      paddle::imperative::SelectedRowsMerge<paddle::experimental::Tensor>(t1,
+                                                                          t2);
+  auto* new_buffer_tensor =
+      static_cast<pten::SelectedRows*>(new_buffer->impl().get());
+  auto* new_buffer_data_sr1 =
+      new_buffer_tensor->mutable_value()->mutable_data<float>(cpu);
+
+  // verify the MergeAdd result
+  for (int64_t i = 0; i < table_size; ++i) {
+    for (int64_t j = 0; j < embedding_width; ++j) {
+      EXPECT_EQ(new_buffer_data_sr1[i * embedding_width + j],
+                (static_cast<float>(i) + static_cast<float>(i)));
+    }
+  }
+}
+
 template <typename Place1, typename Place2, typename T>
 int TensorddTest(Place1 place1, Place2 place2, T t1, T t2) {
   framework::Variable var1;