Unverified commit 18615626, authored by 武毅, committed by GitHub

Merge pull request #7715 from Yancey1989/split_selected_rows_to_multi_pserver

[WIP] Split SelectedRows to multiple pservers
@@ -103,7 +103,7 @@ class RecvOp : public framework::OperatorBase {
     // TODO(typhoonzero): change this to a while_op for every cluster-batch.
     bool exit_flag = false;
-    int64_t barrier_size = param_count * fan_in;
+    size_t barrier_size = param_count * fan_in;
     while (!exit_flag) {
       // Get from multiple trainers, we don't care about the order in which
       // the gradients arrives, just add suffix 0~n and merge the gradient.
......
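
barrier_size is the number of gradient sends the pserver waits for in one step: one send per parameter block it owns (param_count) from each trainer (fan_in). A minimal sketch of that arithmetic, with made-up numbers that are not from the PR:

    # Hypothetical numbers, only to illustrate barrier_size = param_count * fan_in.
    param_count = 2  # parameter blocks assigned to this pserver
    fan_in = 3       # number of trainers sending gradients
    barrier_size = param_count * fan_in
    assert barrier_size == 6  # the pserver waits for 6 sends before one update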
@@ -23,8 +23,6 @@ class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input SelectedRows.");
     AddOutput("Out", "The outputs of input SelectedRows.").AsDuplicable();
-    AddAttr<std::vector<int>>("rows_sections", "Rows section for output.")
-        .SetDefault(std::vector<int>({}));
     AddAttr<std::vector<int>>("height_sections",
                               "Height for each output SelectedRows.")
         .SetDefault(std::vector<int>({}));
@@ -35,16 +33,16 @@ height_sections is only needed when need to split the dims of the original tensor
 Example:
   Input:
-    X.rows = {0, 7, 5}
+    X.rows = {7, 5}
     X.height = 12
   Attr:
-    rows_sections = {1, 2}
-    height_sections = {}
+    height_sections = {4, 8}
   Out:
-    out0.rows = {0}
-    out0.height = 12
-    out1.rows = {7, 5}
-    out2.height = 12
+    out0.rows = {}
+    out0.height = 4
+
+    out1.rows = {5, 7}
+    out2.height = 8
 )DOC");
   }
@@ -61,11 +59,6 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
     std::vector<int> height_sections =
         ctx->Attrs().Get<std::vector<int>>("height_sections");
-    std::vector<int> rows_sections =
-        ctx->Attrs().Get<std::vector<int>>("rows_sections");
-    PADDLE_ENFORCE_EQ(
-        rows_sections.size(), ctx->Outputs("Out").size(),
-        "The size of rows section should be the same with Outputs size.");
     int64_t n = ctx->Outputs("Out").size();
     std::vector<framework::DDim> outs_dims;
......
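
The row-to-shard rule behind the DOC example above, and behind the FindOutIdx helper added in the kernel header below, can be restated as a plain-Python sketch (not part of the PR):

    def find_out_idx(row, height_sections):
        # The row belongs to the shard whose [offset, offset + height) range contains it.
        offset = 0
        for i, height in enumerate(height_sections):
            if offset <= row < offset + height:
                return i
            offset += height
        return -1  # row falls outside every section

    # Reproduces the DOC example: X.rows = {7, 5} with height_sections = {4, 8}
    assert find_out_idx(7, [4, 8]) == 1
    assert find_out_idx(5, [4, 8]) == 1
    assert find_out_idx(2, [4, 8]) == 0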
@@ -16,40 +16,70 @@ limitations under the License. */
 
 #include <vector>
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/selected_rows_functor.h"
 
 namespace paddle {
 namespace operators {
 
+static int FindOutIdx(int row, const std::vector<int>& height_sections) {
+  int offset = 0;
+  for (size_t i = 0; i < height_sections.size(); ++i) {
+    if (row >= offset && row < (offset + height_sections[i])) {
+      return i;
+    }
+    offset += height_sections[i];
+  }
+  return -1;
+}
+
 template <typename DeviceContext, typename T>
 class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* x = ctx.Input<framework::SelectedRows>("X");
     auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
-    auto rows_sections = ctx.Attr<std::vector<int>>("rows_sections");
     auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
 
-    int64_t n = outs.size();
-    int offset = 0;
-
-    for (int64_t i = 0; i < n; ++i) {
-      framework::Vector<int64_t> out_rows;
-      for (int64_t j = 0; j < rows_sections[i]; ++j) {
-        out_rows.push_back(x->rows()[offset + j]);
-      }
-
-      auto& out = outs[i];
-      auto x_dims = x->GetCompleteDims();
-      x_dims[0] = rows_sections[i];
-      out->mutable_value()->mutable_data<T>(x_dims, ctx.GetPlace());
-      framework::Copy(x->value().Slice(offset, rows_sections[i] + offset),
-                      x->place(), ctx.device_context(), out->mutable_value());
-      outs[i]->set_rows(out_rows);
-      if (height_sections.size()) {
-        outs[i]->set_height(height_sections[i]);
+    auto x_rows = x->rows();
+    std::vector<std::vector<int>> outs_rows_idx;
+    outs_rows_idx.resize(outs.size());
+
+    auto row_numel = x->value().numel() / x->value().dims()[0];
+    auto src = x->value().data<T>();
+
+    for (size_t i = 0; i < x_rows.size(); ++i) {
+      int out_idx = FindOutIdx(x_rows[i], height_sections);
+      outs_rows_idx[out_idx].push_back(i);
+    }
+    auto place = ctx.GetPlace();
+
+    for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
+      auto rows_idx = outs_rows_idx[i];
+      if (rows_idx.size() > 0) {
+        auto dims = x->GetCompleteDims();
+        dims[0] = rows_idx.size();
+        outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
+        for (auto idx : rows_idx) {
+          outs[i]->mutable_rows()->push_back(x_rows[idx]);
+        }
+        auto dst = outs[i]->mutable_value()->mutable_data<T>(ctx.GetPlace());
+        for (size_t j = 0; j < rows_idx.size(); j++) {
+          if (platform::is_cpu_place(place)) {
+            memory::Copy(platform::CPUPlace(), dst + j * row_numel,
+                         platform::CPUPlace(), src + rows_idx[j] * row_numel,
+                         sizeof(T) * row_numel);
+          } else {
+#ifdef PADDLE_WITH_CUDA
+            auto stream = ctx.cuda_device_context().stream();
+            memory::Copy(platform::CUDAPlace(), dst + j * row_numel,
+                         platform::CUDAPlace(), src + rows_idx[j] * row_numel,
+                         sizeof(T) * row_numel, stream);
+#else
+            PADDLE_THROW("Paddle is not compiled with GPU");
+#endif
+          }
+        }
       }
-      offset += rows_sections[i];
     }
   }
 };
......
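
In plain terms, the new kernel buckets each input row into an output shard with FindOutIdx, then copies that row's values into the shard's tensor (via memory::Copy on CPU or GPU). A NumPy sketch of the same bucketing and copy, illustrative only and not Paddle API:

    import numpy as np

    def split_selected_rows(rows, value, height_sections):
        # rows: global row indices; value: (len(rows), row_numel) array.
        offsets = np.cumsum([0] + list(height_sections))
        outs = []
        for i, height in enumerate(height_sections):
            idx = [j for j, r in enumerate(rows) if offsets[i] <= r < offsets[i] + height]
            out_rows = [rows[j] for j in idx]  # global indices, as in the kernel
            out_value = value[idx]
            outs.append((out_rows, out_value, height))
        return outs

    # Mirrors the unit test below: rows [0, 5, 7, 4, 20], height_sections [5, 5, 5, 5, 3]
    outs = split_selected_rows([0, 5, 7, 4, 20], np.ones((5, 2)), [5, 5, 5, 5, 3])
    assert outs[0][0] == [0, 4] and outs[1][0] == [5, 7] and outs[4][0] == [20]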
@@ -19,6 +19,7 @@ import optimizer
 from layer_helper import LayerHelper
 from distributed_spliter import *
 import math
+from . import core
 
 
 class VarBlock:
@@ -217,15 +218,28 @@ class DistributeTranspiler:
             if len(splited_vars) <= 1:
                 continue
             orig_var = program.global_block().vars[varname]
-            sections = []
-            for v in splited_vars:
-                sections.append(v.shape[0])
-            program.global_block().append_op(
-                type="split",
-                inputs={"X": orig_var},
-                outputs={"Out": splited_vars},
-                attrs={"sections": sections}  # assume split evenly
-            )
+            if orig_var == core.VarDesc.VarType.SELECTED_ROWS:
+                height_sections = []
+                for v in splited_vars:
+                    height_sections.append(v.shape[0])
+                program.global_block().append_op(
+                    type="split_selected_rows",
+                    inputs={"X": orig_var},
+                    outputs={"Out": splited_vars},
+                    attrs={"height_sections": height_sections})
+            elif orig_var == core.VarDesc.VarType.LOD_TENSOR:
+                sections = []
+                for v in splited_vars:
+                    sections.append(v.shape[0])
+                program.global_block().append_op(
+                    type="split",
+                    inputs={"X": orig_var},
+                    outputs={"Out": splited_vars},
+                    attrs={"sections": sections}  # assume split evenly
+                )
+            else:
+                AssertionError("Variable type should be in set "
+                               "[LOD_TENSOR, SELECTED_ROWS]")
         return var_mapping
 
     def get_trainer_program(self):
......
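
The transpiler hunk above picks the split op by the gradient variable's type: SelectedRows gradients go through split_selected_rows with height_sections, dense LoDTensor gradients through the ordinary split op with sections. A standalone sketch of that dispatch rule, using a hypothetical helper rather than the transpiler's own API:

    from collections import namedtuple

    Var = namedtuple("Var", ["shape"])  # stand-in for a framework variable

    def choose_split_op(var_type, splited_vars):
        # Each output block keeps the number of rows it will own on its pserver.
        sections = [v.shape[0] for v in splited_vars]
        if var_type == "SELECTED_ROWS":
            return "split_selected_rows", {"height_sections": sections}
        if var_type == "LOD_TENSOR":
            return "split", {"sections": sections}
        raise AssertionError("Variable type should be in set [LOD_TENSOR, SELECTED_ROWS]")

    # A sparse gradient of height 10 split across two pservers, 5 rows each:
    op_type, attrs = choose_split_op("SELECTED_ROWS", [Var((5, 2)), Var((5, 2))])
    assert op_type == "split_selected_rows" and attrs["height_sections"] == [5, 5]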
@@ -35,8 +35,8 @@ class TestSpliteSelectedRows(unittest.TestCase):
     def check_with_place(self, place):
         scope = core.Scope()
-        rows = [0, 5, 7, 4]
-        height = 10
+        rows = [0, 5, 7, 4, 20]
+        height = 20
         row_numel = 2
 
         # initialize input variable X
@@ -46,38 +46,41 @@ class TestSpliteSelectedRows(unittest.TestCase):
         np_array = np.ones((len(rows), row_numel)).astype("float32")
         np_array[0, 0] = 2.0
         np_array[2, 1] = 4.0
+        np_array[4, 1] = 8.0
         x_tensor = x.get_tensor()
         x_tensor.set(np_array, place)
 
-        rows_sections = [2, 2]
-        height_sections = []
+        height_sections = [5, 5, 5, 5, 3]
 
         # initialize output variables [out0, out1]
-        out0 = scope.var('out0').get_selected_rows()
-        out1 = scope.var('out1').get_selected_rows()
+        outs_name = ["out%d" % i for i in xrange(len(height_sections))]
+        outs = [
+            scope.var(var_name).get_selected_rows() for var_name in outs_name
+        ]
 
         # expected output selected rows
-        expected_out0_rows = [0, 5]
-        expected_out1_rows = [7, 4]
-        expected_height = height
+        expected_out0_rows = [0, 4]
+        expected_out1_rows = [5, 7]
+        expected_out4_rows = [20]
 
         op = Operator(
             "split_selected_rows",
             X="X",
-            Out=["out0", "out1"],
-            rows_sections=rows_sections,
+            Out=outs_name,
             height_sections=height_sections)
 
         op.run(scope, place)
 
-        self.assertEqual(out0.rows(), expected_out0_rows)
-        self.assertEqual(out1.rows(), expected_out1_rows)
+        self.assertEqual(outs[0].rows(), expected_out0_rows)
+        self.assertEqual(outs[1].rows(), expected_out1_rows)
+        self.assertEqual(outs[4].rows(), expected_out4_rows)
 
-        self.assertEqual(out0.height(), expected_height)
-        self.assertEqual(out1.height(), expected_height)
+        self.assertEqual(outs[0].height(), height_sections[0])
+        self.assertEqual(outs[4].height(), height_sections[4])
 
-        self.assertAlmostEqual(2.0, np.array(out0.get_tensor())[0, 0])
-        self.assertAlmostEqual(4.0, np.array(out1.get_tensor())[0, 1])
+        self.assertAlmostEqual(2.0, np.array(outs[0].get_tensor())[0, 0])
+        self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 1])
+        self.assertAlmostEqual(8.0, np.array(outs[4].get_tensor())[0, 1])
 
     def check_grad_with_place(self, place):
         scope = core.Scope()
@@ -85,8 +88,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
         row_numel = 2
 
         # attr
-        rows_sections = [2, 2]
-        height_sections = []
+        height_sections = [5, 5]
 
         # initialize input variable X
         out0_grad = scope.var("out0@GRAD").get_selected_rows()
@@ -113,7 +115,6 @@ class TestSpliteSelectedRows(unittest.TestCase):
             "sum",
             X=["out0@GRAD", "out1@GRAD"],
             Out="X@GRAD",
-            rows_sections=rows_sections,
             height_sections=height_sections)
 
         grad_op.run(scope, place)
......
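
In the gradient check above, the backward path is exercised with the sum op: the per-shard gradients out0@GRAD and out1@GRAD are merged back into X@GRAD. A NumPy sketch of that merge, illustrative only and not the sum op's implementation:

    import numpy as np

    def merge_selected_rows_grads(grads):
        # grads: list of (rows, value) pairs, one per out*@GRAD shard.
        rows = [r for shard_rows, _ in grads for r in shard_rows]
        value = np.concatenate([shard_value for _, shard_value in grads], axis=0)
        return rows, value

    # Two shard gradients of 2 rows each merge back into a 4-row X@GRAD.
    x_rows, x_value = merge_selected_rows_grads([
        ([0, 4], np.ones((2, 2))),
        ([5, 7], 2 * np.ones((2, 2))),
    ])
    assert x_rows == [0, 4, 5, 7] and x_value.shape == (4, 2)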