提交 8ec748cf 编写于 作者: M minqiyang

Accelerate SelectedRows Functors:

  1. Accelerate SelectedRows MergeAdd functor

  2. Add SelectedRowsSumTo functor to support MergeAdd multiple SelectedRows into one

test=develop
上级 e1904ac2
...@@ -3,8 +3,8 @@ add_subdirectory(detail) ...@@ -3,8 +3,8 @@ add_subdirectory(detail)
endif(NOT WIN32) endif(NOT WIN32)
function(math_library TARGET) function(math_library TARGET)
# math_library is a function to create math library. # math_library is a function to create math library.
# The interface is the same as cc_library. # The interface is the same as cc_library.
# But it handle split GPU/CPU code and link some common library. # But it handle split GPU/CPU code and link some common library.
set(cc_srcs) set(cc_srcs)
set(cu_srcs) set(cu_srcs)
...@@ -53,7 +53,7 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) ...@@ -53,7 +53,7 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
math_library(math_function DEPS blas) math_library(math_function DEPS blas)
math_library(maxouting) math_library(maxouting)
math_library(pooling) math_library(pooling)
math_library(selected_rows_functor DEPS selected_rows math_function) math_library(selected_rows_functor DEPS selected_rows math_function blas)
math_library(sequence2batch) math_library(sequence2batch)
math_library(sequence_padding) math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function) math_library(sequence_pooling DEPS math_function)
......
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include <set> #include <set>
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle { namespace paddle {
...@@ -150,6 +149,46 @@ template struct SelectedRowsAddTo<platform::CPUDeviceContext, double>; ...@@ -150,6 +149,46 @@ template struct SelectedRowsAddTo<platform::CPUDeviceContext, double>;
template struct SelectedRowsAddTo<platform::CPUDeviceContext, int>; template struct SelectedRowsAddTo<platform::CPUDeviceContext, int>;
template struct SelectedRowsAddTo<platform::CPUDeviceContext, int64_t>; template struct SelectedRowsAddTo<platform::CPUDeviceContext, int64_t>;
template <typename T>
struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const std::vector<framework::SelectedRows*>& input1,
const std::vector<int64_t>& input2_offsets,
framework::SelectedRows* input2) {
// Ensure all selected rows have the same height
size_t size = 0u;
for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
auto& in_rows = (*iter)->rows();
size += in_rows.end() - in_rows.begin();
auto in1_height = (*iter)->height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
}
// concat rows
std::vector<int64_t> in2_rows;
in2_rows.reserve(in2_rows.size() + size);
for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
const framework::Vector<int64_t>& in_rows = (*iter)->rows();
in2_rows.insert(in2_rows.end(), in_rows.begin(), in_rows.end());
}
input2->set_rows(in2_rows);
// start = std::chrono::system_clock::now();
auto* in2_value = input2->mutable_value();
auto* in2_data = in2_value->data<T>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
size_t offset = 0u;
for (size_t i = 0u; i != input1.size(); ++i) {
auto& in_value = input1[i]->value();
const auto* in_data = in_value.data<T>();
offset += input2_offsets[i];
blas.VCOPY(in_value.numel(), in_data, in2_data + offset);
}
}
};
template struct SelectedRowsSumTo<platform::CPUDeviceContext, float>;
template struct SelectedRowsSumTo<platform::CPUDeviceContext, double>;
template <typename T> template <typename T>
struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> { struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
...@@ -208,8 +247,18 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -208,8 +247,18 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
framework::SelectedRows* output) { framework::SelectedRows* output) {
framework::SelectedRows& out = *output; framework::SelectedRows& out = *output;
auto input_rows = input.rows(); auto input_rows = input.rows();
std::set<int64_t> row_set(input_rows.begin(), input_rows.end()); std::vector<int64_t> merge_rows;
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end()); merge_rows.reserve(input_rows.size());
std::unordered_map<int64_t, size_t> rows_pos_map;
rows_pos_map.reserve(input_rows.size());
size_t idx = 0u;
for (std::vector<int64_t>::iterator iter = input_rows.begin();
iter != input_rows.end(); ++iter) {
if (rows_pos_map.find(*iter) == rows_pos_map.end()) {
rows_pos_map[*iter] = idx++;
merge_rows.emplace_back(*iter);
}
}
auto input_width = input.value().dims()[1]; auto input_width = input.value().dims()[1];
out.set_rows(merge_rows); out.set_rows(merge_rows);
...@@ -234,8 +283,6 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -234,8 +283,6 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
} }
}; };
template struct MergeAdd<platform::CPUDeviceContext, float>;
template struct MergeAdd<platform::CPUDeviceContext, double>;
template struct MergeAdd<platform::CPUDeviceContext, int>; template struct MergeAdd<platform::CPUDeviceContext, int>;
template struct MergeAdd<platform::CPUDeviceContext, int64_t>; template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
......
...@@ -12,8 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#define INLINE_FOR2(sizei, sizej) \ #define INLINE_FOR2(sizei, sizej) \
...@@ -49,6 +54,15 @@ struct SelectedRowsAddTo { ...@@ -49,6 +54,15 @@ struct SelectedRowsAddTo {
const int64_t input2_offset, framework::SelectedRows* input2); const int64_t input2_offset, framework::SelectedRows* input2);
}; };
// input2 = [all input in input1] + input2
template <typename DeviceContext, typename T>
struct SelectedRowsSumTo {
void operator()(const DeviceContext& context,
const std::vector<framework::SelectedRows*>& input1,
const std::vector<int64_t>& input2_offsets,
framework::SelectedRows* input2);
};
// input2 = input1 + input2 // input2 = input1 + input2
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct SelectedRowsAddToTensor { struct SelectedRowsAddToTensor {
...@@ -70,6 +84,108 @@ struct MergeAdd { ...@@ -70,6 +84,108 @@ struct MergeAdd {
framework::SelectedRows* output); framework::SelectedRows* output);
}; };
template <>
struct MergeAdd<platform::CPUDeviceContext, float> {
framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input) {
framework::SelectedRows out;
(*this)(context, input, &out);
return out;
}
void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input,
framework::SelectedRows* output) {
framework::SelectedRows& out = *output;
auto input_rows = input.rows();
std::vector<int64_t> merge_rows;
merge_rows.reserve(input_rows.size());
std::unordered_map<int64_t, size_t> rows_pos_map;
rows_pos_map.reserve(input_rows.size());
size_t idx = 0u;
for (std::vector<int64_t>::iterator iter = input_rows.begin();
iter != input_rows.end(); ++iter) {
if (rows_pos_map.find(*iter) == rows_pos_map.end()) {
rows_pos_map[*iter] = idx++;
merge_rows.emplace_back(*iter);
}
}
auto input_width = input.value().dims()[1];
out.set_rows(merge_rows);
out.set_height(input.height());
out.mutable_value()->mutable_data<float>(
framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
math::SetConstant<platform::CPUDeviceContext, float> constant_functor;
constant_functor(context, out.mutable_value(), 0.0);
auto* out_data = out.mutable_value()->data<float>();
auto* input_data = input.value().data<float>();
auto blas = GetBlas<platform::CPUDeviceContext, float>(context);
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_pos_map[input_rows[i]];
float* y = out_data + out_i * input_width;
const float* x = input_data + i * input_width;
blas.AXPY(input_width, 1., x, y);
}
}
};
template <>
struct MergeAdd<platform::CPUDeviceContext, double> {
framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input) {
framework::SelectedRows out;
(*this)(context, input, &out);
return out;
}
void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input,
framework::SelectedRows* output) {
framework::SelectedRows& out = *output;
auto input_rows = input.rows();
std::vector<int64_t> merge_rows;
merge_rows.reserve(input_rows.size());
std::unordered_map<int64_t, size_t> rows_pos_map;
rows_pos_map.reserve(input_rows.size());
size_t idx = 0u;
for (std::vector<int64_t>::iterator iter = input_rows.begin();
iter != input_rows.end(); ++iter) {
if (rows_pos_map.find(*iter) == rows_pos_map.end()) {
rows_pos_map[*iter] = idx++;
merge_rows.emplace_back(*iter);
}
}
auto input_width = input.value().dims()[1];
out.set_rows(merge_rows);
out.set_height(input.height());
out.mutable_value()->mutable_data<double>(
framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
math::SetConstant<platform::CPUDeviceContext, double> constant_functor;
constant_functor(context, out.mutable_value(), 0.0);
auto* out_data = out.mutable_value()->data<double>();
auto* input_data = input.value().data<double>();
auto blas = GetBlas<platform::CPUDeviceContext, double>(context);
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_pos_map[input_rows[i]];
double* y = out_data + out_i * input_width;
const double* x = input_data + i * input_width;
blas.AXPY(input_width, 1., x, y);
}
}
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct Add { struct Add {
framework::SelectedRows operator()(const DeviceContext& context, framework::SelectedRows operator()(const DeviceContext& context,
......
...@@ -219,3 +219,174 @@ TEST(selected_rows_functor, cpu_add_to) { ...@@ -219,3 +219,174 @@ TEST(selected_rows_functor, cpu_add_to) {
// row9: 2.0 + 3.0 // row9: 2.0 + 3.0
EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0);
} }
TEST(selected_rows_functor, cpu_merge_add_float) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CPUDeviceContext ctx(cpu_place);
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
float>
functor;
int64_t height = 10;
int64_t row_numel = 10;
std::vector<int64_t> rows{0, 4, 4, 7};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows{
new paddle::framework::SelectedRows(rows, height)};
auto* in_value = selected_rows->mutable_value();
in_value->mutable_data<float>(
paddle::framework::make_ddim(
{static_cast<int64_t>(rows.size()), row_numel}),
cpu_place);
functor(ctx, in_value, 1.0);
std::unique_ptr<paddle::framework::SelectedRows> output{
new paddle::framework::SelectedRows()};
paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
float>
merge_add_functor;
merge_add_functor(ctx, *selected_rows, output.get());
auto out_height = output->height();
EXPECT_EQ(out_height, height);
auto& out_rows = output->rows();
EXPECT_EQ(out_rows[0], 0);
EXPECT_EQ(out_rows[1], 4);
EXPECT_EQ(out_rows[2], 7);
auto* out_data = output->value().data<float>();
EXPECT_EQ(out_data[0 * row_numel], 1.0);
EXPECT_EQ(out_data[1 * row_numel], 2.0);
EXPECT_EQ(out_data[2 * row_numel], 1.0);
}
TEST(selected_rows_functor, cpu_merge_add_int) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CPUDeviceContext ctx(cpu_place);
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, int>
functor;
int64_t height = 10;
int64_t row_numel = 10;
std::vector<int64_t> rows{0, 4, 4, 7};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows{
new paddle::framework::SelectedRows(rows, height)};
auto* in_value = selected_rows->mutable_value();
in_value->mutable_data<int>(
paddle::framework::make_ddim(
{static_cast<int64_t>(rows.size()), row_numel}),
cpu_place);
functor(ctx, in_value, 1);
std::unique_ptr<paddle::framework::SelectedRows> output{
new paddle::framework::SelectedRows()};
paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
int>
merge_add_functor;
merge_add_functor(ctx, *selected_rows, output.get());
auto out_height = output->height();
EXPECT_EQ(out_height, height);
auto& out_rows = output->rows();
EXPECT_EQ(out_rows[0], 0);
EXPECT_EQ(out_rows[1], 4);
EXPECT_EQ(out_rows[2], 7);
auto* out_data = output->value().data<int>();
EXPECT_EQ(out_data[0 * row_numel], 1);
EXPECT_EQ(out_data[1 * row_numel], 2);
EXPECT_EQ(out_data[2 * row_numel], 1);
}
TEST(selected_rows_functor, cpu_sum_to) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CPUDeviceContext ctx(cpu_place);
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
float>
functor;
int64_t height = 10;
int64_t row_numel = 10;
std::vector<int64_t> rows1{0, 4, 7};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
new paddle::framework::SelectedRows(rows1, height)};
auto* in1_value = selected_rows1->mutable_value();
in1_value->mutable_data<float>(
paddle::framework::make_ddim(
{static_cast<int64_t>(rows1.size()), row_numel}),
cpu_place);
functor(ctx, in1_value, 1.0);
std::vector<int64_t> rows2{0, 5, 7, 9};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
new paddle::framework::SelectedRows(rows2, height)};
auto* in2_value = selected_rows2->mutable_value();
in2_value->mutable_data<float>(
paddle::framework::make_ddim(
{static_cast<int64_t>(rows2.size()), row_numel}),
cpu_place);
functor(ctx, in2_value, 2.0);
std::unique_ptr<paddle::framework::SelectedRows> output{
new paddle::framework::SelectedRows()};
output->set_height(height);
auto* out_value = output->mutable_value();
// simplely concat two SelectedRows
out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
cpu_place);
paddle::operators::math::SelectedRowsSumTo<paddle::platform::CPUDeviceContext,
float>
sum_to_functor;
sum_to_functor(ctx, std::vector<paddle::framework::SelectedRows*>(
{selected_rows1.get(), selected_rows2.get()}),
std::vector<int64_t>({0, in1_value->numel()}), output.get());
auto out_height = output->height();
EXPECT_EQ(out_height, height);
auto& out_rows = output->rows();
// input1 rows
EXPECT_EQ(out_rows[0], 0);
EXPECT_EQ(out_rows[1], 4);
EXPECT_EQ(out_rows[2], 7);
// input2 rows
EXPECT_EQ(out_rows[3], 0);
EXPECT_EQ(out_rows[4], 5);
EXPECT_EQ(out_rows[5], 7);
EXPECT_EQ(out_rows[6], 9);
auto* out_data = output->value().data<float>();
// input1 value
EXPECT_EQ(out_data[0 * row_numel + 0], 1.0);
EXPECT_EQ(out_data[0 * row_numel + 8], 1.0);
EXPECT_EQ(out_data[1 * row_numel + 1], 1.0);
EXPECT_EQ(out_data[2 * row_numel + 6], 1.0);
// input2 value
EXPECT_EQ(out_data[3 * row_numel + 3], 2.0);
EXPECT_EQ(out_data[3 * row_numel + 8], 2.0);
EXPECT_EQ(out_data[4 * row_numel + 4], 2.0);
EXPECT_EQ(out_data[5 * row_numel + 7], 2.0);
EXPECT_EQ(out_data[6 * row_numel + 9], 2.0);
std::unique_ptr<paddle::framework::Tensor> tensor1{
new paddle::framework::Tensor()};
tensor1->mutable_data<float>(
paddle::framework::make_ddim({height, row_numel}), cpu_place);
functor(ctx, tensor1.get(), 3.0);
paddle::operators::math::SelectedRowsAddToTensor<
paddle::platform::CPUDeviceContext, float>
add_to_tensor_functor;
add_to_tensor_functor(ctx, *output, tensor1.get());
auto* tensor1_data = tensor1->data<float>();
// row0: 1.0 + 2.0 + 3.0
EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0);
// row1: 3.0
EXPECT_EQ(tensor1_data[1 * row_numel + 1], 3.0);
// row4 : 1.0 + 3.0
EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0);
// row5: 2.0 + 3.0
EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0);
// row6: 3.0
EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0);
// row7: 1.0 + 2.0 + 3.0
EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0);
// row9: 2.0 + 3.0
EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册