diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 91101356436c26171eaca2fe01dfd4d937e71717..b0276f4080b3989047f9d181e9d37a18dcfad5fa 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -3,8 +3,8 @@ add_subdirectory(detail) endif(NOT WIN32) function(math_library TARGET) - # math_library is a function to create math library. - # The interface is the same as cc_library. + # math_library is a function to create math library. + # The interface is the same as cc_library. # But it handle split GPU/CPU code and link some common library. set(cc_srcs) set(cu_srcs) @@ -53,7 +53,7 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) math_library(math_function DEPS blas) math_library(maxouting) math_library(pooling) -math_library(selected_rows_functor DEPS selected_rows math_function) +math_library(selected_rows_functor DEPS selected_rows math_function blas) math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 8e8baf49b2330e95ff1a868b0b0a03bc10d84484..43d593710c2ae7cc422cee7ffc5591681dc34da7 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" namespace paddle { @@ -150,6 +149,46 @@ template struct SelectedRowsAddTo; template struct SelectedRowsAddTo; template struct SelectedRowsAddTo; +template +struct SelectedRowsSumTo { + void operator()(const platform::CPUDeviceContext& context, + const std::vector& input1, + const std::vector& input2_offsets, + framework::SelectedRows* input2) { + // Ensure all selected rows have the same height + size_t size = 0u; + for (auto iter = input1.begin(); iter != input1.end(); ++iter) { + auto& in_rows = (*iter)->rows(); + size += in_rows.end() - in_rows.begin(); + auto in1_height = (*iter)->height(); + PADDLE_ENFORCE_EQ(in1_height, input2->height()); + } + // concat rows + std::vector in2_rows; + in2_rows.reserve(in2_rows.size() + size); + for (auto iter = input1.begin(); iter != input1.end(); ++iter) { + const framework::Vector& in_rows = (*iter)->rows(); + in2_rows.insert(in2_rows.end(), in_rows.begin(), in_rows.end()); + } + input2->set_rows(in2_rows); + + // start = std::chrono::system_clock::now(); + auto* in2_value = input2->mutable_value(); + auto* in2_data = in2_value->data(); + auto blas = math::GetBlas(context); + size_t offset = 0u; + for (size_t i = 0u; i != input1.size(); ++i) { + auto& in_value = input1[i]->value(); + const auto* in_data = in_value.data(); + offset += input2_offsets[i]; + blas.VCOPY(in_value.numel(), in_data, in2_data + offset); + } + } +}; + +template struct SelectedRowsSumTo; +template struct SelectedRowsSumTo; + template struct SelectedRowsAddToTensor { void operator()(const platform::CPUDeviceContext& context, @@ -208,8 +247,18 @@ struct MergeAdd { framework::SelectedRows* output) { framework::SelectedRows& out = *output; auto input_rows = input.rows(); - std::set row_set(input_rows.begin(), input_rows.end()); - std::vector merge_rows(row_set.begin(), row_set.end()); + std::vector merge_rows; + merge_rows.reserve(input_rows.size()); + std::unordered_map rows_pos_map; + rows_pos_map.reserve(input_rows.size()); + size_t idx = 0u; + for (std::vector::iterator iter = input_rows.begin(); + iter != input_rows.end(); ++iter) { + if (rows_pos_map.find(*iter) == rows_pos_map.end()) { + rows_pos_map[*iter] = idx++; + merge_rows.emplace_back(*iter); + } + } auto input_width = input.value().dims()[1]; out.set_rows(merge_rows); @@ -234,8 +283,6 @@ struct MergeAdd { } }; -template struct MergeAdd; -template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index aa419f74fcd2a53cdd734ec270bc154b78c9f2ff..3d99c9b3f2303c941a150f77516703b8103c6c25 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -12,8 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #define INLINE_FOR2(sizei, sizej) \ @@ -49,6 +54,15 @@ struct SelectedRowsAddTo { const int64_t input2_offset, framework::SelectedRows* input2); }; +// input2 = [all input in input1] + input2 +template +struct SelectedRowsSumTo { + void operator()(const DeviceContext& context, + const std::vector& input1, + const std::vector& input2_offsets, + framework::SelectedRows* input2); +}; + // input2 = input1 + input2 template struct SelectedRowsAddToTensor { @@ -70,6 +84,108 @@ struct MergeAdd { framework::SelectedRows* output); }; +template <> +struct MergeAdd { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + (*this)(context, input, &out); + return out; + } + + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* output) { + framework::SelectedRows& out = *output; + auto input_rows = input.rows(); + std::vector merge_rows; + merge_rows.reserve(input_rows.size()); + std::unordered_map rows_pos_map; + rows_pos_map.reserve(input_rows.size()); + size_t idx = 0u; + for (std::vector::iterator iter = input_rows.begin(); + iter != input_rows.end(); ++iter) { + if (rows_pos_map.find(*iter) == rows_pos_map.end()) { + rows_pos_map[*iter] = idx++; + merge_rows.emplace_back(*iter); + } + } + + auto input_width = input.value().dims()[1]; + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + auto blas = GetBlas(context); + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = rows_pos_map[input_rows[i]]; + float* y = out_data + out_i * input_width; + const float* x = input_data + i * input_width; + blas.AXPY(input_width, 1., x, y); + } + } +}; + +template <> +struct MergeAdd { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + (*this)(context, input, &out); + return out; + } + + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* output) { + framework::SelectedRows& out = *output; + auto input_rows = input.rows(); + std::vector merge_rows; + merge_rows.reserve(input_rows.size()); + std::unordered_map rows_pos_map; + rows_pos_map.reserve(input_rows.size()); + size_t idx = 0u; + for (std::vector::iterator iter = input_rows.begin(); + iter != input_rows.end(); ++iter) { + if (rows_pos_map.find(*iter) == rows_pos_map.end()) { + rows_pos_map[*iter] = idx++; + merge_rows.emplace_back(*iter); + } + } + + auto input_width = input.value().dims()[1]; + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + auto blas = GetBlas(context); + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = rows_pos_map[input_rows[i]]; + double* y = out_data + out_i * input_width; + const double* x = input_data + i * input_width; + blas.AXPY(input_width, 1., x, y); + } + } +}; + template struct Add { framework::SelectedRows operator()(const DeviceContext& context, diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index 70bed820ee58885861fa8c5535c931f258625572..835589356042b44c9fa5988aed726434fd66910a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -219,3 +219,174 @@ TEST(selected_rows_functor, cpu_add_to) { // row9: 2.0 + 3.0 EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); } + +TEST(selected_rows_functor, cpu_merge_add_float) { + paddle::platform::CPUPlace cpu_place; + paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows{0, 4, 4, 7}; + std::unique_ptr selected_rows{ + new paddle::framework::SelectedRows(rows, height)}; + auto* in_value = selected_rows->mutable_value(); + in_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows.size()), row_numel}), + cpu_place); + functor(ctx, in_value, 1.0); + + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + + paddle::operators::math::scatter::MergeAdd + merge_add_functor; + merge_add_functor(ctx, *selected_rows, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + + auto* out_data = output->value().data(); + + EXPECT_EQ(out_data[0 * row_numel], 1.0); + EXPECT_EQ(out_data[1 * row_numel], 2.0); + EXPECT_EQ(out_data[2 * row_numel], 1.0); +} + +TEST(selected_rows_functor, cpu_merge_add_int) { + paddle::platform::CPUPlace cpu_place; + paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows{0, 4, 4, 7}; + std::unique_ptr selected_rows{ + new paddle::framework::SelectedRows(rows, height)}; + auto* in_value = selected_rows->mutable_value(); + in_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows.size()), row_numel}), + cpu_place); + functor(ctx, in_value, 1); + + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + + paddle::operators::math::scatter::MergeAdd + merge_add_functor; + merge_add_functor(ctx, *selected_rows, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + + auto* out_data = output->value().data(); + + EXPECT_EQ(out_data[0 * row_numel], 1); + EXPECT_EQ(out_data[1 * row_numel], 2); + EXPECT_EQ(out_data[2 * row_numel], 1); +} +TEST(selected_rows_functor, cpu_sum_to) { + paddle::platform::CPUPlace cpu_place; + paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + functor; + int64_t height = 10; + int64_t row_numel = 10; + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{ + new paddle::framework::SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows1.size()), row_numel}), + cpu_place); + functor(ctx, in1_value, 1.0); + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{ + new paddle::framework::SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows2.size()), row_numel}), + cpu_place); + functor(ctx, in2_value, 2.0); + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + output->set_height(height); + auto* out_value = output->mutable_value(); + // simplely concat two SelectedRows + out_value->mutable_data(paddle::framework::make_ddim({7, 10}), + cpu_place); + paddle::operators::math::SelectedRowsSumTo + sum_to_functor; + sum_to_functor(ctx, std::vector( + {selected_rows1.get(), selected_rows2.get()}), + std::vector({0, in1_value->numel()}), output.get()); + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + auto& out_rows = output->rows(); + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + auto* out_data = output->value().data(); + // input1 value + EXPECT_EQ(out_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); + std::unique_ptr tensor1{ + new paddle::framework::Tensor()}; + tensor1->mutable_data( + paddle::framework::make_ddim({height, row_numel}), cpu_place); + functor(ctx, tensor1.get(), 3.0); + paddle::operators::math::SelectedRowsAddToTensor< + paddle::platform::CPUDeviceContext, float> + add_to_tensor_functor; + add_to_tensor_functor(ctx, *output, tensor1.get()); + auto* tensor1_data = tensor1->data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor1_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); +}