Unverified commit 30b66f03, authored by zhaoyingli, committed via GitHub

fix conflict (#44891)

Parent 247002ec
@@ -26,6 +26,7 @@ from .dist_attribute import get_op_dist_attr_field_keys
 class DistributedOperator:
+
     def __init__(self, serial_op, dist_attr=None):
         self._serial_op = serial_op
         self._serial_inputs = {}
@@ -248,6 +249,7 @@ class DistributedOperator:
 class DistributedModule:
+
     def __init__(self, serial_module, dist_attr=None):
         self._serial_module = serial_module
         self._dist_attr = dist_attr
@@ -265,6 +267,4 @@ class DistributedModule:
             dist_op = DistributedOperator(op, self._dist_attr)
             dist_op.dist_attr.mark_annotated_as(self._dist_attr)
             default_dist_ctx.add_dist_op_for_program(dist_op)
-        if isinstance(output, Variable):
-            output = [output]
-        return list(output)
+        return output
@@ -47,6 +47,7 @@ paddle.seed(44)
 class MyDataset(Dataset):
+
     def __init__(self, num_samples):
         super(MyDataset, self).__init__()
         self.num_samples = num_samples
@@ -61,6 +62,7 @@ class MyDataset(Dataset):
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -69,39 +71,41 @@ class MLPLayer(nn.Layer):
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
         self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")
 
     def forward(self, input):
-        out = auto.shard_op(
-            self.norm, dist_attr={"process_mesh": PP_MESH_0})(input)[0]
-        out = self.linear0(input)
+        out = auto.shard_op(self.norm, dist_attr={"process_mesh":
+                                                  PP_MESH_0})(input)
+        out = self.linear0(out)
         out = F.gelu(out, approximate=True)
-        out = auto.shard_op(
-            self.linear1, dist_attr={"process_mesh": PP_MESH_1})(out)[0]
+        out = auto.shard_op(self.linear1, dist_attr={"process_mesh":
                                                      PP_MESH_1})(out)
         out = self.dropout(out)
         out = self.linear2(out)
         return out
 
 def train():
-    mlp = MLPLayer(
-        hidden_size=hidden_size,
-        intermediate_size=4 * hidden_size,
-        dropout_ratio=0.1,
-        initializer_range=0.02)
+    mlp = MLPLayer(hidden_size=hidden_size,
+                   intermediate_size=4 * hidden_size,
+                   dropout_ratio=0.1,
+                   initializer_range=0.02)
     loss = paddle.nn.CrossEntropyLoss()
-    optimizer = paddle.fluid.optimizer.AdamOptimizer(
-        learning_rate=0.00001,
-        beta1=0.9,
-        beta2=0.999,
-        epsilon=1e-08,
+    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
+                                                     beta1=0.9,
+                                                     beta2=0.999,
+                                                     epsilon=1e-08,
@@ -119,8 +123,7 @@ def train():
     dist_strategy.semi_auto = True
     fleet.init(is_collective=True, strategy=dist_strategy)
-    engine = Engine(
-        mlp,
-        inputs_spec=inputs_spec,
-        labels_spec=labels_spec,
-        strategy=dist_strategy)
+    engine = Engine(mlp,
+                    inputs_spec=inputs_spec,
+                    labels_spec=labels_spec,
+                    strategy=dist_strategy)
...
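Note on the API change visible in this file: the new code no longer indexes the shard_op result with [0], because a shard_op-wrapped callable now returns the wrapped layer's output directly instead of a one-element list (see the return output change in DistributedModule above). A minimal sketch of the new calling pattern, assuming the import alias these tests use and an illustrative two-stage pipeline mesh; the layer sizes and meshes below are not part of this commit, and actually running it requires four launched ranks:

import paddle
import paddle.nn as nn
import paddle.distributed.auto_parallel as auto

paddle.enable_static()

# Illustrative two-stage pipeline meshes (assumption, mirroring the test).
PP_MESH_0 = auto.ProcessMesh([0, 1])
PP_MESH_1 = auto.ProcessMesh([2, 3])

main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name="x", shape=[4, 1024], dtype="float32")
    norm = nn.LayerNorm(1024)
    linear = nn.Linear(1024, 1024)
    # The wrapper returned by shard_op is called like the layer itself and
    # now yields the layer's output directly, not a single-element list.
    out = auto.shard_op(norm, dist_attr={"process_mesh": PP_MESH_0})(x)
    out = auto.shard_op(linear, dist_attr={"process_mesh": PP_MESH_1})(out)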
@@ -38,6 +38,7 @@ PP_MESH_1 = auto.ProcessMesh([2, 3])
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -45,42 +46,51 @@ class MLPLayer(nn.Layer):
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
         self.word_embeddings = nn.Embedding(
             hidden_size,
             hidden_size,
-            weight_attr=paddle.ParamAttr(
-                name="word_embeddings",
-                initializer=nn.initializer.Normal(
-                    mean=0.0, std=initializer_range)))
+            weight_attr=paddle.ParamAttr(name="word_embeddings",
                                          initializer=nn.initializer.Normal(
                                              mean=0.0, std=initializer_range)))
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
-        self.linear2 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear2 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
 
     def forward(self, input):
-        auto.shard_tensor(
-            self.word_embeddings.weight,
-            dist_attr={"process_mesh": PP_MESH_0,
-                       "dims_mapping": [0, -1]})
-        auto.shard_tensor(
-            self.linear0.weight,
-            dist_attr={"process_mesh": PP_MESH_0,
-                       "dims_mapping": [-1, 0]})
-        auto.shard_tensor(
-            self.linear1.weight,
-            dist_attr={"process_mesh": PP_MESH_1,
-                       "dims_mapping": [0, -1]})
-        auto.shard_tensor(
-            self.linear2.weight,
-            dist_attr={"process_mesh": PP_MESH_1,
-                       "dims_mapping": [0, -1]})
+        auto.shard_tensor(self.word_embeddings.weight,
+                          dist_attr={
+                              "process_mesh": PP_MESH_0,
+                              "dims_mapping": [0, -1]
+                          })
+        auto.shard_tensor(self.linear0.weight,
+                          dist_attr={
+                              "process_mesh": PP_MESH_0,
+                              "dims_mapping": [-1, 0]
+                          })
+        auto.shard_tensor(self.linear1.weight,
+                          dist_attr={
+                              "process_mesh": PP_MESH_1,
+                              "dims_mapping": [0, -1]
+                          })
+        auto.shard_tensor(self.linear2.weight,
+                          dist_attr={
+                              "process_mesh": PP_MESH_1,
+                              "dims_mapping": [0, -1]
+                          })
         w_out = self.word_embeddings(input)
         out = self.linear0(w_out)
         gelu_out = F.gelu(out, approximate=True)
@@ -98,19 +108,22 @@ def mlp_forward(train_program, start_program):
         hidden_size = 1024
         sequence_len = 512
         input = static.data(name="input", shape=[batch_size], dtype='int32')
-        label = static.data(
-            name="label", shape=[batch_size, 1], dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, 1],
+                            dtype='float32')
 
-        auto.shard_tensor(
-            input, dist_attr={"process_mesh": PP_MESH_0,
-                              "dims_mapping": [-1]})
-        auto.shard_tensor(
-            label,
-            dist_attr={"process_mesh": PP_MESH_1,
-                       "dims_mapping": [-1, -1]})
+        auto.shard_tensor(input,
+                          dist_attr={
+                              "process_mesh": PP_MESH_0,
+                              "dims_mapping": [-1]
+                          })
+        auto.shard_tensor(label,
+                          dist_attr={
+                              "process_mesh": PP_MESH_1,
+                              "dims_mapping": [-1, -1]
+                          })
 
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02)
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02)
@@ -137,8 +150,7 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id):
     complete_train_program = completer.complete_forward_annotation(
         train_program)
     dist_context.block_state.parse_forward_blocks(complete_train_program)
-    params_grads = parallelizer._generate_backward(
-        complete_train_program,
-        startup_program,
-        loss,
-        parameter_list=None,
+    params_grads = parallelizer._generate_backward(complete_train_program,
                                                    startup_program,
                                                    loss,
                                                    parameter_list=None,
@@ -171,8 +183,7 @@ def check_send_recv_result(dist_main_prog, rank_id):
         if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names[
                 0]:
             send_result = True
-        if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[
-                0]:
+        if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[0]:
             recv_result = True
     return send_result and recv_result
@@ -206,6 +217,7 @@ def check_allgather(dist_main_program):
 class TestMLPReshard(unittest.TestCase):
+
     def test_mlp_mppp(self):
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
@@ -230,29 +242,20 @@ class TestMLPReshard(unittest.TestCase):
         process_mesh = auto.ProcessMesh(mesh=[0, 3])
         with static.program_guard(train_program, startup_program):
             x = paddle.static.data(name="x", shape=[4, 4], dtype='float32')
-            x = auto.shard_tensor(
-                x,
-                dist_attr={
-                    "process_mesh": process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            x = auto.shard_tensor(x,
+                                  dist_attr={
+                                      "process_mesh": process_mesh,
+                                      "dims_mapping": [0, -1]
+                                  })
             w = paddle.static.data(name="w", shape=[4, 4], dtype='float32')
-            w = auto.shard_tensor(
-                w,
-                dist_attr={
-                    "process_mesh": process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
+            w = auto.shard_tensor(w,
+                                  dist_attr={
+                                      "process_mesh": process_mesh,
+                                      "dims_mapping": [-1, -1]
+                                  })
-            # y = paddle.distributed.shard_op(paddle.matmul, process_mesh, {
-            #     x.name: [-1, -1],
-            #     w.name: [-1, -1]
-            # }, **{"x": x,
-            #       "y": w})[0]
-            y = paddle.distributed.shard_op(
-                paddle.matmul,
-                dist_attr={
-                    "process_mesh": process_mesh,
-                    x: {
+            y = paddle.distributed.shard_op(paddle.matmul,
+                                            dist_attr={
+                                                "process_mesh": process_mesh,
+                                                x: {
@@ -261,7 +264,7 @@ class TestMLPReshard(unittest.TestCase):
                                                 w: {
                                                     "dims_mapping": [-1, -1]
                                                 }
-                })(x, w)[0]
+                                            })(x, w)
         rank_id = 0
         dist_context = DistributedContext()
...
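For reference on the dist_attr dictionaries used throughout this test: dims_mapping lists, for each tensor axis, the process-mesh dimension that axis is split along, with -1 marking an axis that stays replicated, so [0, -1] splits a 2-D tensor by rows across mesh dimension 0 while [-1, -1] keeps it fully replicated. A minimal sketch of the shard_tensor annotation pattern, with an assumed two-rank mesh (illustrative only, not part of this commit):

import paddle
import paddle.distributed.auto_parallel as auto

paddle.enable_static()

mesh = auto.ProcessMesh([0, 1])  # assumed 1-D mesh with two ranks

with paddle.static.program_guard(paddle.static.Program()):
    w = paddle.static.data(name="w", shape=[4, 4], dtype='float32')
    # Axis 0 is split across mesh dimension 0; axis 1 (-1) stays replicated.
    w = auto.shard_tensor(w,
                          dist_attr={
                              "process_mesh": mesh,
                              "dims_mapping": [0, -1]
                          })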