未验证 提交 c38b0488 编写于 作者: C caozhou 提交者: GitHub

add reshard module (#35779)

* add reshard module

* fix conflict

* update reshard module

* update and add unittest

* update reshard module and unittest

* add more unittests
上级 7a724ddb
...@@ -19,5 +19,7 @@ from .interface import set_offload_device # noqa: F401 ...@@ -19,5 +19,7 @@ from .interface import set_offload_device # noqa: F401
from .interface import set_pipeline_stage # noqa: F401 from .interface import set_pipeline_stage # noqa: F401
from .interface import ProcessMesh # noqa: F401 from .interface import ProcessMesh # noqa: F401
from .completion import complete_annotation # noqa: F401 from .completion import complete_annotation # noqa: F401
from .completion import complete_backward_annotation # noqa: F401
from .reshard import reshard # noqa: F401
__all__ = [] __all__ = []
...@@ -23,6 +23,7 @@ from .utils import compute_compatible_dims_mapping ...@@ -23,6 +23,7 @@ from .utils import compute_compatible_dims_mapping
from .utils import print_program_with_distributed_attr from .utils import print_program_with_distributed_attr
from .context import get_default_distributed_context from .context import get_default_distributed_context
from .operators import find_best_compatible_distributed_operator_impl from .operators import find_best_compatible_distributed_operator_impl
from .attribute import OperatorDistributedAttribute, TensorDistributedAttribute
ELEMENTWISE_LIKE_OP_LIST = ["elementwise_add", "gelu", "dropout", "cast"] ELEMENTWISE_LIKE_OP_LIST = ["elementwise_add", "gelu", "dropout", "cast"]
...@@ -597,3 +598,172 @@ def complete_annotation(program, dist_context=None): ...@@ -597,3 +598,172 @@ def complete_annotation(program, dist_context=None):
dist_context.amend_distributed_attr_for_program() dist_context.amend_distributed_attr_for_program()
return program return program
def complete_backward_annotation(auto_parallel_main_prog, dist_context):
    """Complete the annotation of vars and ops in the backward phase for parallel program.

    Starting from the ``fill_constant`` op that produces a ``@GRAD`` output,
    every following op in the global block receives an
    ``OperatorDistributedAttribute`` and newly created gradient vars receive
    a ``TensorDistributedAttribute``. Both are derived from the attributes
    that ``dist_context`` already holds for the forward ops and vars.

    Args:
        auto_parallel_main_prog: Program whose global block holds the
            forward, backward and optimizer ops.
        dist_context: Distributed context that is read for existing
            attributes and updated in place with the new ones.

    Raises:
        AssertionError: If no backward start op is found, if an optimizer op
            has more than one ``Param``/``Grad`` input, or if a gradient
            ``sum`` op has more than one output.
    """

    def _is_grad_var_name(name):
        return "@GRAD" in name

    def _forward_name(grad_name):
        # Everything before the first "@GRAD" marker is the forward var name.
        return grad_name[:grad_name.find("@GRAD")]

    block = auto_parallel_main_prog.global_block()
    ops = list(block.ops)
    block_vars = block.vars

    grad_start_idx = None
    for idx, op in enumerate(ops):
        # TODO: use _is_loss_op to judge
        # NOTE(review): the scan does not stop at the first hit, so the last
        # matching fill_constant op wins - confirm this is intended.
        if op.type == "fill_constant" and any(
                "@GRAD" in var_name for var_name in op.output_arg_names):
            grad_start_idx = idx

    assert grad_start_idx is not None, "No backward procedure found in this program."

    for idx in range(grad_start_idx, len(ops)):
        cur_op = ops[idx]

        # complete the loss op
        if idx == grad_start_idx:
            grad_var = block_vars[cur_op.output_arg_names[0]]
            grad_var_name = grad_var.name
            forward_var = block_vars[_forward_name(grad_var_name)]

            # The loss gradient inherits mesh and dims mapping from the loss.
            forward_attr = dist_context.get_tensor_distributed_attr_for_program(
                forward_var)
            process_mesh = forward_attr.get_process_mesh()
            dims_mapping = forward_attr.get_dims_mapping()

            tensor_attr = TensorDistributedAttribute(grad_var, dist_context)
            tensor_attr.set_dims_mapping(dims_mapping)
            tensor_attr.set_process_mesh(process_mesh)
            dist_context.set_tensor_distributed_attr_for_program(grad_var,
                                                                 tensor_attr)

            op_attr = OperatorDistributedAttribute(cur_op, dist_context)
            op_attr.set_process_mesh(process_mesh)
            dist_context.set_op_distributed_attr_for_program(cur_op, op_attr)

            # in the data parallel mode, the loss op followed by scale op.
            # Guard the look-ahead so a trailing loss-grad op cannot raise
            # IndexError.
            if idx + 1 < len(ops):
                next_op = ops[idx + 1]
                if next_op.type == "scale" \
                        and grad_var_name in next_op.input_arg_names \
                        and grad_var_name in next_op.output_arg_names:
                    op_attr = OperatorDistributedAttribute(next_op,
                                                           dist_context)
                    op_attr.set_process_mesh(process_mesh)
                    dist_context.set_op_distributed_attr_for_program(next_op,
                                                                     op_attr)
            continue

        # complete the annotation of the optimizer op.
        # TODO: use _is_optimizer_op to judge
        if "Grad" in cur_op.input_names and "Param" in cur_op.input_names:
            assert len(cur_op.input(
                "Param")) == 1, "Only support one-to-one now."
            assert len(cur_op.input(
                "Grad")) == 1, "Only support one-to-one now."

            param_var = block_vars[cur_op.input("Param")[0]]
            grad_var = block_vars[cur_op.input("Grad")[0]]
            param_attr = dist_context.get_tensor_distributed_attr_for_program(
                param_var)
            process_mesh = param_attr.get_process_mesh()
            dims_mapping = param_attr.get_dims_mapping()

            op_attr = OperatorDistributedAttribute(cur_op, dist_context)
            op_attr.set_process_mesh(process_mesh)
            op_attr.set_input_dims_mapping(grad_var.name, dims_mapping)
            dist_context.set_op_distributed_attr_for_program(cur_op, op_attr)
            continue

        # complete the c_allreduce_sum op for gradient in the data parallel mode.
        if cur_op.type == "c_allreduce_sum" \
                and cur_op.input_arg_names == cur_op.output_arg_names:
            grad_var = block_vars[cur_op.output_arg_names[0]]
            op_attr = OperatorDistributedAttribute(cur_op, dist_context)
            process_mesh = dist_context.get_tensor_distributed_attr_for_program(
                grad_var).get_process_mesh()
            op_attr.set_process_mesh(process_mesh)
            dist_context.set_op_distributed_attr_for_program(cur_op, op_attr)
            continue

        # complete the annotation of grad op
        grad_op = cur_op
        grad_op_type = grad_op.desc.type()

        # Cut each grad input name right after its first "@GRAD" so it can be
        # compared with the names in the generated grad op descs. This only
        # depends on the grad op, so it is computed once instead of once per
        # forward op.
        grad_op_input = []
        for input_arg_name in grad_op.desc.input_arg_names():
            if "@GRAD" in input_arg_name:
                grad_op_input.append(
                    input_arg_name[:input_arg_name.find("@GRAD") + 5])
            else:
                grad_op_input.append(input_arg_name)

        for op in ops[:grad_start_idx]:
            grad_op_desc_list, _ = core.get_grad_op_desc(op.desc, set(), [])

            # like sum op: the count of grad op will larger than 1
            matched = any(
                grad_op_input == grad_op_desc.input_arg_names() and
                grad_op_type == grad_op_desc.type()
                for grad_op_desc in grad_op_desc_list)
            if not matched:
                continue

            op_attr = dist_context.get_op_distributed_attr_for_program(op)
            grad_op_attr = OperatorDistributedAttribute(grad_op, dist_context)
            grad_op_attr.set_process_mesh(op_attr.get_process_mesh())

            for var_name in grad_op.input_arg_names:
                if "@GRAD" in var_name:
                    # Gradient inputs reuse the mapping stored on the tensor.
                    dims_mapping = dist_context.get_tensor_distributed_attr_for_program(
                        block_vars[var_name]).get_dims_mapping()
                else:
                    # Forward inputs reuse the mapping of the forward op.
                    dims_mapping = op_attr.get_input_dims_mapping(var_name)
                grad_op_attr.set_input_dims_mapping(var_name, dims_mapping)
            dist_context.set_op_distributed_attr_for_program(grad_op,
                                                             grad_op_attr)

            for var_name in grad_op.output_arg_names:
                if "@GRAD" in var_name:
                    forward_var = block_vars[_forward_name(var_name)]
                    tensor_attr = TensorDistributedAttribute(
                        block_vars[var_name], dist_context)
                    process_mesh = grad_op_attr.get_process_mesh()
                    dims_mapping = grad_op_attr.get_input_dims_mapping(
                        forward_var.name)
                    tensor_attr.set_process_mesh(process_mesh)
                    tensor_attr.set_dims_mapping(dims_mapping)
                    dist_context.set_tensor_distributed_attr_for_program(
                        block_vars[var_name], tensor_attr)
            # Only the first matching forward op is used.
            break

        # complete the annotation of sum op for multiple renamed grad var
        if grad_op.type == "sum" and all(
                map(_is_grad_var_name, grad_op.input_arg_names)):
            assert len(grad_op.output_arg_names
                       ) == 1, "The output count of sum op should be one."
            grad_op_attr = OperatorDistributedAttribute(grad_op, dist_context)

            for var_name in grad_op.input_arg_names:
                if "@GRAD" in var_name:
                    forward_var = block_vars[_forward_name(var_name)]
                    dims_mapping = dist_context.get_tensor_distributed_attr_for_program(
                        forward_var).get_dims_mapping()
                    grad_op_attr.set_input_dims_mapping(var_name, dims_mapping)

            for var_name in grad_op.output_arg_names:
                forward_var = block_vars[_forward_name(var_name)]
                forward_attr = dist_context.get_tensor_distributed_attr_for_program(
                    forward_var)
                process_mesh = forward_attr.get_process_mesh()
                dims_mapping = forward_attr.get_dims_mapping()

                tensor_attr = TensorDistributedAttribute(block_vars[var_name],
                                                         dist_context)
                tensor_attr.set_dims_mapping(dims_mapping)
                tensor_attr.set_process_mesh(process_mesh)
                dist_context.set_tensor_distributed_attr_for_program(
                    block_vars[var_name], tensor_attr)
                grad_op_attr.set_process_mesh(process_mesh)

            dist_context.set_op_distributed_attr_for_program(grad_op,
                                                             grad_op_attr)
...@@ -59,6 +59,9 @@ class DistributedContext: ...@@ -59,6 +59,9 @@ class DistributedContext:
if self._process_mesh.ndim == 1: if self._process_mesh.ndim == 1:
self._data_parallel_axis = 0 self._data_parallel_axis = 0
self._model_parallel_axis = 0 self._model_parallel_axis = 0
elif self._process_mesh.ndim == 3:
self._data_parallel_axis = 1
self._model_parallel_axis = 2
else: else:
self._data_parallel_axis = 0 self._data_parallel_axis = 0
self._model_parallel_axis = 1 self._model_parallel_axis = 1
......
...@@ -147,7 +147,17 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl): ...@@ -147,7 +147,17 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl):
process_mesh_shape) process_mesh_shape)
num_partition = process_mesh_shape[embedding_row_dim_mapping] num_partition = process_mesh_shape[embedding_row_dim_mapping]
# TODO generalize here, support any mesh group # TODO generalize here, support any mesh group
model_parallel_axis, process_mesh = op_dist_attr.get_owner_context(
)._get_model_parallel_info()
if mesh_shape == 1: if mesh_shape == 1:
if rank_id not in process_mesh_group:
assert len(
process_mesh.topology
) == 2, " row_parallel_embedding process mapping only support 2 dimensional process mesh, \
but got {}".format(len(process_mesh.topology))
rank_id = process_mesh_group[
process_mesh.process_group.index(rank_id) %
process_mesh_shape[0]]
relative_idx = process_mesh_group.index(rank_id) relative_idx = process_mesh_group.index(rank_id)
else: else:
relative_idx = rank_id % num_partition relative_idx = rank_id % num_partition
...@@ -156,8 +166,6 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl): ...@@ -156,8 +166,6 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl):
relative_idx = relative_idx * per_part_size relative_idx = relative_idx * per_part_size
# TODO calculate ring id # TODO calculate ring id
model_parallel_axis, process_mesh = op_dist_attr.get_owner_context(
)._get_model_parallel_info()
group_ranks = _get_comm_group(process_mesh.process_group, group_ranks = _get_comm_group(process_mesh.process_group,
process_mesh.topology, process_mesh.topology,
model_parallel_axis, rank_id) model_parallel_axis, rank_id)
......
...@@ -17,9 +17,10 @@ from paddle.distributed.fleet import cloud_utils ...@@ -17,9 +17,10 @@ from paddle.distributed.fleet import cloud_utils
import paddle.fluid.core as core import paddle.fluid.core as core
from .context import DistributedContext from .context import DistributedContext
from .context import get_default_distributed_context from .context import get_default_distributed_context
from .completion import complete_annotation from .completion import complete_annotation, complete_backward_annotation
from .partitioner import Partitioner from .partitioner import Partitioner
from .process import get_all_process_groups from .process import get_all_process_groups
from .reshard import reshard
class AutoParallelizer: class AutoParallelizer:
...@@ -85,10 +86,16 @@ class AutoParallelizer: ...@@ -85,10 +86,16 @@ class AutoParallelizer:
# instantiate communication by process_mapping. # instantiate communication by process_mapping.
all_process_groups = get_all_process_groups() all_process_groups = get_all_process_groups()
for process_group in all_process_groups: for process_group in all_process_groups:
if rank not in process_group._ranks:
continue
process_group.instantiate() process_group.instantiate()
# The last step: remove all distributed attributes to be compatible # The last step: remove all distributed attributes to be compatible
# with inference. # with inference.
self._remove_distributed_attrs(partitioned_main_prog) self._remove_distributed_attrs(partitioned_main_prog)
complete_backward_annotation(partitioned_main_prog, self._dist_context)
reshard(partitioned_main_prog, partitioned_startup_prog, rank,
self._dist_context)
return dist_optimize_ops, dist_params_grads, partitioned_startup_prog, partitioned_main_prog return dist_optimize_ops, dist_params_grads, partitioned_startup_prog, partitioned_main_prog
此差异已折叠。
...@@ -86,6 +86,10 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto) ...@@ -86,6 +86,10 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_static_mp_layers) list(APPEND MIXED_DIST_TEST_OPS test_fleet_static_mp_layers)
list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner)
list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner_gpt) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner_gpt)
list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard)
list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_serial)
list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_mppp)
list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_dpmppp)
foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
list(REMOVE_ITEM TEST_OPS ${TEST_OP}) list(REMOVE_ITEM TEST_OPS ${TEST_OP})
endforeach() endforeach()
...@@ -225,6 +229,10 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) ...@@ -225,6 +229,10 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM))
LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy) LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy)
LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner)
LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_gpt) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_gpt)
LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard)
LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_serial)
LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_mppp)
LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp)
elseif(WITH_GPU) elseif(WITH_GPU)
if (${CUDNN_VERSION} VERSION_LESS 7100) if (${CUDNN_VERSION} VERSION_LESS 7100)
LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
...@@ -589,6 +597,10 @@ if(WITH_DISTRIBUTE) ...@@ -589,6 +597,10 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_auto_parallel_partitioner MODULES test_auto_parallel_partitioner ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_partitioner MODULES test_auto_parallel_partitioner ENVS ${dist_ENVS})
py_test_modules(test_auto_parallel_partitioner_gpt MODULES test_auto_parallel_partitioner_gpt ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_partitioner_gpt MODULES test_auto_parallel_partitioner_gpt ENVS ${dist_ENVS})
py_test_modules(test_auto_parallel_reshard MODULES test_auto_parallel_reshard ENVS ${dist_ENVS})
py_test_modules(test_auto_parallel_reshard_serial MODULES test_auto_parallel_reshard_serial ENVS ${dist_ENVS})
py_test_modules(test_auto_parallel_reshard_mppp MODULES test_auto_parallel_reshard_mppp ENVS ${dist_ENVS})
py_test_modules(test_auto_parallel_reshard_dpmppp MODULES test_auto_parallel_reshard_dpmppp ENVS ${dist_ENVS})
endif(NOT WIN32) endif(NOT WIN32)
endif(NOT APPLE) endif(NOT APPLE)
if(WITH_DGC) if(WITH_DGC)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import paddle.nn as nn
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
import paddle.distributed.auto_parallel as auto
from paddle.distributed.auto_parallel.context import DistributedContext
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.completion import complete_backward_annotation
from paddle.distributed.auto_parallel.reshard import reshard
# The helpers below build static-graph programs, so static mode is enabled
# once at import time.
paddle.enable_static()
# Parallel strategy of the current test case; the test methods overwrite it
# ("pp" or "dp") before building the program.
_global_parallel_strategy = None
# Process mesh handed to the distributed context; assigned by each test case.
_global_process_mesh = None
# Root mesh that the meshes created in the test cases use as their parent.
ROOT_MESH = auto.ProcessMesh([0, 1])
# Per-stage meshes; assigned by the "pp" test case only.
PP_MESH_0 = None
PP_MESH_1 = None
class MLPLayer(nn.Layer):
    """Layer norm followed by linear -> gelu -> linear.

    The two linear weights are annotated with ``auto.shard_tensor`` on every
    ``forward`` call. The mesh used depends on the module-level
    ``_global_parallel_strategy``.
    """

    def __init__(self,
                 hidden_size=1024,
                 intermediate_size=4 * 1024,
                 initializer_range=0.02):
        super(MLPLayer, self).__init__()
        # Both linear layers share one normally-distributed weight initializer.
        shared_weight_attr = paddle.ParamAttr(
            initializer=nn.initializer.Normal(
                mean=0.0, std=initializer_range))
        shared_bias_attr = None

        # Creation order is kept: linear0, linear1, then the layer norm.
        self.linear0 = nn.Linear(
            hidden_size,
            intermediate_size,
            shared_weight_attr,
            bias_attr=shared_bias_attr)
        self.linear1 = nn.Linear(
            intermediate_size,
            hidden_size,
            shared_weight_attr,
            bias_attr=shared_bias_attr)
        self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5)

    def forward(self, input):
        """Annotate the weights, then run norm -> linear0 -> gelu -> linear1."""
        # Pipeline mode puts each linear weight on its own stage mesh; every
        # other mode uses the single global mesh for both.
        if _global_parallel_strategy == "pp":
            first_mesh, second_mesh = PP_MESH_0, PP_MESH_1
        else:
            first_mesh = second_mesh = _global_process_mesh

        auto.shard_tensor(
            self.linear0.weight, first_mesh, dim_mapping=[-1, -1])
        auto.shard_tensor(
            self.linear1.weight, second_mesh, dim_mapping=[-1, -1])

        hidden = self.linear0(self.norm(input))
        hidden = F.gelu(hidden, approximate=True)
        return self.linear1(hidden)
def mlp_forward(train_program, start_program):
    """Build the MLP forward pass and its mean squared-error loss.

    All ops are created inside ``train_program`` / ``start_program`` under a
    fresh unique-name guard.

    Args:
        train_program: Program that receives the forward ops.
        start_program: Program that receives the initialization ops.

    Returns:
        Tuple ``(loss, train_program, start_program)``.
    """
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024

        input_data = static.data(
            name="input", shape=[batch_size, hidden_size], dtype='float32')
        label = static.data(
            name="label", shape=[batch_size, 1], dtype='float32')

        if _global_parallel_strategy == "pp":
            # Input lives on the first stage, label on the second stage.
            auto.shard_tensor(input_data, PP_MESH_0, dim_mapping=[-1, -1])
            auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1])
        elif _global_parallel_strategy == "dp":
            # Data parallel: dimension 0 of the input is mapped to mesh axis 0.
            auto.shard_tensor(
                input_data, _global_process_mesh, dim_mapping=[0, -1])
        else:
            auto.shard_tensor(
                input_data, _global_process_mesh, dim_mapping=[-1, -1])

        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            initializer_range=0.02)

        predict = mlp(input_data)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)

    return loss, train_program, start_program
def get_dist_prog(train_program, startup_program, dist_context, rank_id):
    """Build the partitioned main and startup programs for ``rank_id``.

    Runs forward construction, annotation completion, forward partitioning,
    backward generation and optimizer insertion, in that order.

    Returns:
        Tuple ``(auto_parallel_main_prog, auto_parallel_startup_prog)``.
    """
    dist_context.set_process_mesh(_global_process_mesh)

    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    # auto completion
    completed_program = auto.complete_annotation(train_program, dist_context)

    partitioner = Partitioner(fleet.DistributedStrategy(), dist_context,
                              rank_id)

    # logical partition
    dist_main_prog, dist_startup_prog = partitioner.transpile_forward(
        completed_program, startup_program)
    params_grads = partitioner.apply_backward(
        loss, completed_program, startup_program, dist_main_prog,
        dist_startup_prog)
    partitioner.apply_optimize(paddle.fluid.optimizer.AdamOptimizer(),
                               params_grads, dist_main_prog,
                               dist_startup_prog)

    return dist_main_prog, dist_startup_prog
def check_backward_dist_attr(dist_context, dist_main_prog, op_need_check):
    """Tell whether an op and all of its tensors carry distributed attributes.

    Args:
        dist_context: Context queried for op and tensor distributed attributes.
        dist_main_prog: Program whose global block owns the op's variables.
        op_need_check: The operator to inspect.

    Returns:
        ``True`` only if the op has a distributed attribute with a process
        mesh, every input has a non-empty dims mapping on the op, and every
        input and output tensor has a non-empty dims mapping and a process
        mesh. ``False`` otherwise, including when any attribute is missing.
    """
    block_vars = dist_main_prog.global_block().vars

    op_dist_attr = dist_context.get_op_distributed_attr_for_program(
        op_need_check)
    # Return early: without an op attribute there is nothing to query, and
    # dereferencing it below would raise instead of reporting False.
    if not op_dist_attr or not op_dist_attr.get_process_mesh():
        return False

    def _tensor_is_annotated(var_name):
        # A missing tensor attribute counts as "not annotated".
        tensor_attr = dist_context.get_tensor_distributed_attr_for_program(
            block_vars[var_name])
        return bool(tensor_attr and tensor_attr.get_dims_mapping() and
                    tensor_attr.get_process_mesh())

    for var_name in op_need_check.input_arg_names:
        if not op_dist_attr.get_input_dims_mapping(var_name):
            return False
        if not _tensor_is_annotated(var_name):
            return False

    return all(
        _tensor_is_annotated(var_name)
        for var_name in op_need_check.output_arg_names)
def check_send_recv_result(dist_main_prog, rank_id):
    """Check that the program has the expected ``send_v2`` / ``recv_v2`` pair.

    Rank 0 is expected to send ``gelu_0.tmp_0`` and receive its gradient.
    Any other rank is expected to send the gradient and receive the
    activation.

    Returns:
        ``True`` when both the expected send and the expected recv are found.
    """
    if rank_id == 0:
        sent_name, received_name = "gelu_0.tmp_0", "gelu_0.tmp_0@GRAD"
    else:
        sent_name, received_name = "gelu_0.tmp_0@GRAD", "gelu_0.tmp_0"

    found_send = False
    found_recv = False
    for op in dist_main_prog.global_block().ops:
        if op.type == "send_v2":
            # Exact match against the list of input names.
            if sent_name in op.input_arg_names:
                found_send = True
        elif op.type == "recv_v2":
            # Substring match against the first output name only.
            if received_name in op.output_arg_names[0]:
                found_recv = True

    return found_send and found_recv
def check_initialization(dist_startup_prog, rank_id):
    """Check that the startup program holds exactly this rank's parameters.

    The comparison is order-sensitive: parameter names are collected in the
    iteration order of the global block's ``vars`` mapping.
    """
    expected_params = [
        "layer_norm_0.b_0", "layer_norm_0.w_0", "linear_0.w_0",
        "linear_0.b_0"
    ] if rank_id == 0 else ['linear_1.w_0', 'linear_1.b_0']

    found_params = [
        name
        for name, var in dist_startup_prog.global_block().vars.items()
        if var.is_parameter
    ]
    return found_params == expected_params
def check_initialization_for_dp(dist_startup_prog):
    """Check that all six parameters exist and each one is broadcast.

    Both the parameter names and the first outputs of the ``c_broadcast``
    ops must equal the expected list, in the same order.
    """
    expected_params = [
        "layer_norm_0.b_0", "layer_norm_0.w_0", "linear_0.w_0", "linear_0.b_0"
    ] + ['linear_1.w_0', 'linear_1.b_0']

    startup_block = dist_startup_prog.global_block()
    found_params = [
        name for name, var in startup_block.vars.items() if var.is_parameter
    ]

    startup_block = dist_startup_prog.global_block()
    broadcast_targets = [
        op.output_arg_names[0] for op in startup_block.ops
        if op.type == "c_broadcast"
    ]

    return found_params == expected_params and expected_params == broadcast_targets
class TestMLPReshard(unittest.TestCase):
    """Tests for backward annotation completion and resharding of the MLP.

    Every test configures the module-level strategy and mesh globals itself,
    so the outcome does not depend on the order in which the tests run.
    """

    def test_complete_backward_annotation(self):
        """A grad op must carry distributed attributes after completion."""
        global _global_parallel_strategy
        # Pin the default strategy so a previously run test cannot leak its
        # "pp" / "dp" setting into this one.
        _global_parallel_strategy = None
        global _global_process_mesh
        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH)

        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        dist_context = DistributedContext()
        rank_id = 0
        dist_main_prog, dist_startup_prog = get_dist_prog(
            train_program, startup_program, dist_context, rank_id)
        complete_backward_annotation(dist_main_prog, dist_context)

        op_need_check = next(
            (op for op in dist_main_prog.global_block().ops
             if op.type == "gelu_grad"), None)
        # Fail with a clear message instead of passing None to the checker.
        self.assertIsNotNone(op_need_check,
                             "gelu_grad op not found in the main program")

        # grad op should have dist attr
        self.assertTrue(
            check_backward_dist_attr(dist_context, dist_main_prog,
                                     op_need_check))

    def test_mlp_pp(self):
        """Pipeline parallel: rank 1 must get send/recv ops and its own params."""
        global _global_parallel_strategy
        _global_parallel_strategy = "pp"
        global _global_process_mesh
        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH)
        global PP_MESH_0
        PP_MESH_0 = auto.ProcessMesh(mesh=[0], parent=ROOT_MESH)
        global PP_MESH_1
        PP_MESH_1 = auto.ProcessMesh(mesh=[1], parent=ROOT_MESH)

        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        dist_context = DistributedContext()
        rank_id = 1
        dist_main_prog, dist_startup_prog = get_dist_prog(
            train_program, startup_program, dist_context, rank_id)
        complete_backward_annotation(dist_main_prog, dist_context)
        reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context)

        # check send and recv result
        self.assertTrue(check_send_recv_result(dist_main_prog, rank_id))

        # parameter initialization of every rank should be different in the pipeline scene
        self.assertTrue(check_initialization(dist_startup_prog, rank_id))

    def test_mlp_dp(self):
        """Data parallel: no send/recv ops, and every parameter is broadcast."""
        global _global_parallel_strategy
        _global_parallel_strategy = "dp"
        global _global_process_mesh
        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH)

        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        dist_context = DistributedContext()
        rank_id = 0
        dist_main_prog, dist_startup_prog = get_dist_prog(
            train_program, startup_program, dist_context, rank_id)
        complete_backward_annotation(dist_main_prog, dist_context)
        reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context)

        # send and recv should not exist in dp scene.
        self.assertFalse(check_send_recv_result(dist_main_prog, rank_id))

        # all parameters should be initialized in dp scene
        self.assertTrue(check_initialization_for_dp(dist_startup_prog))
# Run the test cases when this file is executed as a script.
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import paddle.nn as nn
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
import paddle.distributed.auto_parallel as auto
from paddle.distributed.auto_parallel.context import DistributedContext
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.completion import complete_backward_annotation
from paddle.distributed.auto_parallel.reshard import reshard
# The helpers below build static-graph programs, so static mode is enabled
# once at import time.
paddle.enable_static()
# Label of the parallel strategy exercised by this test module.
_global_parallel_strategy = "dp_mp_pp"
# Three-dimensional root mesh over ranks 0-7.
ROOT_MESH = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]])
# Mesh handed to the distributed context; same layout as the root mesh.
_global_process_mesh = auto.ProcessMesh(
[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], parent=ROOT_MESH)
# Two-dimensional sub-meshes: ranks 0, 1, 4, 5 and ranks 2, 3, 6, 7.
PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], parent=ROOT_MESH)
PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], parent=ROOT_MESH)
class MLPLayer(nn.Layer):
    """Layer norm followed by linear -> gelu -> linear.

    ``forward`` annotates ``linear0.weight`` on ``PP_MESH_0`` with dim
    mapping ``[-1, 1]`` and ``linear1.weight`` on ``PP_MESH_1`` with dim
    mapping ``[1, -1]``.
    """

    def __init__(self,
                 hidden_size=1024,
                 intermediate_size=4 * 1024,
                 initializer_range=0.02):
        super(MLPLayer, self).__init__()
        # Both linear layers share one normally-distributed weight initializer.
        shared_weight_attr = paddle.ParamAttr(
            initializer=nn.initializer.Normal(
                mean=0.0, std=initializer_range))
        shared_bias_attr = None

        # Creation order is kept: linear0, linear1, then the layer norm.
        self.linear0 = nn.Linear(
            hidden_size,
            intermediate_size,
            shared_weight_attr,
            bias_attr=shared_bias_attr)
        self.linear1 = nn.Linear(
            intermediate_size,
            hidden_size,
            shared_weight_attr,
            bias_attr=shared_bias_attr)
        self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5)

    def forward(self, input):
        """Annotate the weights, then run norm -> linear0 -> gelu -> linear1."""
        auto.shard_tensor(self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 1])
        auto.shard_tensor(self.linear1.weight, PP_MESH_1, dim_mapping=[1, -1])

        hidden = self.linear0(self.norm(input))
        hidden = F.gelu(hidden, approximate=True)
        return self.linear1(hidden)
def mlp_forward(train_program, start_program):
    """Build the MLP forward pass and its mean squared-error loss.

    All ops are created inside ``train_program`` / ``start_program`` under a
    fresh unique-name guard. The input is annotated on ``PP_MESH_0`` and the
    label on ``PP_MESH_1``, both with dim mapping ``[0, -1]``.

    Args:
        train_program: Program that receives the forward ops.
        start_program: Program that receives the initialization ops.

    Returns:
        Tuple ``(loss, train_program, start_program)``.
    """
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024

        input_data = static.data(
            name="input", shape=[batch_size, hidden_size], dtype='float32')
        label = static.data(
            name="label", shape=[batch_size, 1], dtype='float32')

        auto.shard_tensor(input_data, PP_MESH_0, dim_mapping=[0, -1])
        auto.shard_tensor(label, PP_MESH_1, dim_mapping=[0, -1])

        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            initializer_range=0.02)

        predict = mlp(input_data)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)

    return loss, train_program, start_program
def get_dist_prog(train_program, startup_program, dist_context, rank_id):
    """Build the partitioned main and startup programs for ``rank_id``.

    Runs forward construction, annotation completion, forward partitioning,
    backward generation and optimizer insertion, in that order.

    Returns:
        Tuple ``(auto_parallel_main_prog, auto_parallel_startup_prog)``.
    """
    dist_context.set_process_mesh(_global_process_mesh)

    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    # auto completion
    completed_program = auto.complete_annotation(train_program, dist_context)

    partitioner = Partitioner(fleet.DistributedStrategy(), dist_context,
                              rank_id)

    # logical partition
    dist_main_prog, dist_startup_prog = partitioner.transpile_forward(
        completed_program, startup_program)
    params_grads = partitioner.apply_backward(
        loss, completed_program, startup_program, dist_main_prog,
        dist_startup_prog)
    partitioner.apply_optimize(paddle.fluid.optimizer.AdamOptimizer(),
                               params_grads, dist_main_prog,
                               dist_startup_prog)

    return dist_main_prog, dist_startup_prog
def check_send_recv_result(dist_main_prog, rank_id):
    """Check that the program has the expected ``send_v2`` / ``recv_v2`` pair.

    Ranks 0, 1, 4 and 5 are expected to send ``gelu_0.tmp_0`` and receive
    its gradient. Every other rank is expected to send the gradient and
    receive the activation.

    Returns:
        ``True`` when both the expected send and the expected recv are found.
    """
    if rank_id in [0, 1, 4, 5]:
        sent_name, received_name = "gelu_0.tmp_0", "gelu_0.tmp_0@GRAD"
    else:
        sent_name, received_name = "gelu_0.tmp_0@GRAD", "gelu_0.tmp_0"

    found_send = False
    found_recv = False
    for op in dist_main_prog.global_block().ops:
        if op.type == "send_v2":
            # Exact match against the list of input names.
            if sent_name in op.input_arg_names:
                found_send = True
        elif op.type == "recv_v2":
            # Substring match against the first output name only.
            if received_name in op.output_arg_names[0]:
                found_recv = True

    return found_send and found_recv
def check_initialization_for_dpmppp(dist_startup_prog):
    """Return True when the startup program has at least one ``c_broadcast`` op."""
    broadcast_targets = [
        op.output_arg_names[0]
        for op in dist_startup_prog.global_block().ops
        if op.type == "c_broadcast"
    ]
    return bool(broadcast_targets)
class TestMLPReshard(unittest.TestCase):
    """Reshard test for the combined data/model/pipeline parallel MLP."""

    def test_mlp_dpmppp(self):
        """Rank 2 must get send/recv ops and at least one parameter broadcast."""
        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        dist_context = DistributedContext()
        rank_id = 2
        dist_main_prog, dist_startup_prog = get_dist_prog(
            train_program, startup_program, dist_context, rank_id)
        # The leftover debug prints of the whole programs were removed; they
        # flooded the test log and took no part in any assertion.
        complete_backward_annotation(dist_main_prog, dist_context)
        reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context)

        # check send and recv result
        self.assertTrue(check_send_recv_result(dist_main_prog, rank_id))

        # check parameter initialization
        self.assertTrue(check_initialization_for_dpmppp(dist_startup_prog))
# Run the test cases when this file is executed as a script.
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import paddle.nn as nn
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
import paddle.distributed.auto_parallel as auto
from paddle.distributed.auto_parallel.context import DistributedContext
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.completion import complete_backward_annotation
from paddle.distributed.auto_parallel.reshard import reshard
# The helpers below build static-graph programs, so static mode is enabled
# once at import time.
paddle.enable_static()
# Label of the parallel strategy exercised by this test module.
_global_parallel_strategy = "mp_pp"
# Two-dimensional root mesh over ranks 0-3.
ROOT_MESH = auto.ProcessMesh([[0, 1], [2, 3]])
# Mesh with the same layout as the root mesh, created as its child.
_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]], parent=ROOT_MESH)
# One-dimensional sub-meshes: ranks 0, 1 and ranks 2, 3.
PP_MESH_0 = auto.ProcessMesh([0, 1], parent=ROOT_MESH)
PP_MESH_1 = auto.ProcessMesh([2, 3], parent=ROOT_MESH)
class MLPLayer(nn.Layer):
    """Embedding -> linear0 -> gelu, then two parallel linears that are summed.

    ``forward`` annotates the embedding table and ``linear0.weight`` on
    ``PP_MESH_0``, and ``linear1.weight`` / ``linear2.weight`` on
    ``PP_MESH_1``.
    """

    def __init__(self,
                 hidden_size=1024,
                 intermediate_size=4 * 1024,
                 initializer_range=0.02):
        super(MLPLayer, self).__init__()
        # All three linear layers share one weight initializer.
        shared_weight_attr = paddle.ParamAttr(
            initializer=nn.initializer.Normal(
                mean=0.0, std=initializer_range))
        shared_bias_attr = None

        # The embedding table is square (hidden_size x hidden_size) and has
        # its own named parameter attribute.
        embedding_attr = paddle.ParamAttr(
            name="word_embeddings",
            initializer=nn.initializer.Normal(
                mean=0.0, std=initializer_range))
        self.word_embeddings = nn.Embedding(
            hidden_size, hidden_size, weight_attr=embedding_attr)

        self.linear0 = nn.Linear(
            hidden_size,
            intermediate_size,
            shared_weight_attr,
            bias_attr=shared_bias_attr)
        self.linear1 = nn.Linear(
            intermediate_size,
            hidden_size,
            shared_weight_attr,
            bias_attr=shared_bias_attr)
        self.linear2 = nn.Linear(
            intermediate_size,
            hidden_size,
            shared_weight_attr,
            bias_attr=shared_bias_attr)

    def forward(self, input):
        """Annotate the weights and return ``linear1(g) + linear2(g)``.

        ``g`` is the gelu of ``linear0`` applied to the embedded input.
        """
        auto.shard_tensor(
            self.word_embeddings.weight, PP_MESH_0, dim_mapping=[0, -1])
        auto.shard_tensor(self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 0])
        auto.shard_tensor(self.linear1.weight, PP_MESH_1, dim_mapping=[0, -1])
        auto.shard_tensor(self.linear2.weight, PP_MESH_1, dim_mapping=[0, -1])

        embedded = self.word_embeddings(input)
        activated = F.gelu(self.linear0(embedded), approximate=True)
        first_branch = self.linear1(activated)
        second_branch = self.linear2(activated)
        return first_branch + second_branch
def mlp_forward(train_program, start_program):
    """Build the MLP forward pass and its loss inside the given programs.

    Declares an int32 ``input`` of shape ``[4]`` annotated on ``PP_MESH_0``
    and a float32 ``label`` of shape ``[4, 1]`` annotated on ``PP_MESH_1``,
    runs them through ``MLPLayer`` and reduces the squared error to its mean.

    Args:
        train_program: Program that receives the forward ops.
        start_program: Program that receives the initialization ops.

    Returns:
        Tuple ``(loss, train_program, start_program)``.
    """
    # unique_name.guard() is presumably used so that generated variable names
    # (e.g. the "gelu_0.tmp_0" the checks look for) are reproducible - confirm.
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024

        # The local is called ``tokens`` so the ``input`` builtin is not
        # shadowed; the variable name inside the program is still "input".
        tokens = static.data(name="input", shape=[batch_size], dtype='int32')
        label = static.data(
            name="label", shape=[batch_size, 1], dtype='float32')

        auto.shard_tensor(tokens, PP_MESH_0, dim_mapping=[-1])
        auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1])

        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            initializer_range=0.02)

        predict = mlp(tokens)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)

    return loss, train_program, start_program
def get_dist_prog(train_program, startup_program, dist_context, rank_id):
    """Build the MLP and partition it (forward, backward, optimizer) for a rank.

    Args:
        train_program: Empty program that receives the serial forward graph.
        startup_program: Empty program that receives the initialization ops.
        dist_context: Distributed context; its process mesh is set to
            ``_global_process_mesh`` and it is used for annotation completion
            and partitioning.
        rank_id: Rank the partitioned programs are generated for.

    Returns:
        Tuple ``(auto_parallel_main_prog, auto_parallel_startup_prog)``.
    """
    # The module-level mesh is only read here, so no ``global`` declaration
    # is required.
    dist_context.set_process_mesh(_global_process_mesh)

    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    # auto completion
    complete_train_program = auto.complete_annotation(train_program,
                                                      dist_context)

    dist_strategy = fleet.DistributedStrategy()
    partitioner = Partitioner(dist_strategy, dist_context, rank_id)

    # logical partition
    auto_parallel_main_prog, auto_parallel_startup_prog = \
        partitioner.transpile_forward(complete_train_program, startup_program)

    dist_params_grads = partitioner.apply_backward(
        loss, complete_train_program, startup_program,
        auto_parallel_main_prog, auto_parallel_startup_prog)

    optimizer = paddle.fluid.optimizer.AdamOptimizer()
    # The value returned by apply_optimize is not needed by the callers of
    # this helper, so it is not bound to a name.
    partitioner.apply_optimize(optimizer, dist_params_grads,
                               auto_parallel_main_prog,
                               auto_parallel_startup_prog)

    return auto_parallel_main_prog, auto_parallel_startup_prog
def check_send_recv_result(dist_main_prog, rank_id):
    """Return True if the program has both the expected send and recv ops.

    For ranks 0 and 1 the program must contain a ``send_v2`` op that has
    ``gelu_0.tmp_0`` among its input names and a ``recv_v2`` op whose first
    output name contains ``gelu_0.tmp_0@GRAD``. For every other rank it must
    contain a ``send_v2`` op whose first input name contains
    ``gelu_0.tmp_0@GRAD`` and a ``recv_v2`` op whose first output name
    contains ``gelu_0.tmp_0``.

    Args:
        dist_main_prog: Program whose global block ops are inspected.
        rank_id: Rank the program was generated for.

    Returns:
        bool: True only when both the send and the recv op are found.
    """
    ops = dist_main_prog.global_block().ops

    if rank_id in [0, 1]:
        # Exact membership test on the list of input names.
        send_result = any(
            op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names
            for op in ops)
        # Substring test on the first output name.
        recv_result = any(
            op.type == "recv_v2" and
            "gelu_0.tmp_0@GRAD" in op.output_arg_names[0] for op in ops)
    else:
        # Both checks here are substring tests on the first argument name.
        send_result = any(
            op.type == "send_v2" and
            "gelu_0.tmp_0@GRAD" in op.input_arg_names[0] for op in ops)
        recv_result = any(
            op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[0]
            for op in ops)

    return send_result and recv_result
def check_initialization_for_mppp(dist_startup_prog, rank_id):
    """Check which variables the startup program broadcasts for this rank.

    Collects the first output name of every ``c_broadcast`` op in the global
    block, in program order, and compares that list with the expectation:
    no broadcast for ranks 0 and 1, and exactly ``linear_1.b_0`` followed by
    ``linear_2.b_0`` for every other rank.

    Args:
        dist_startup_prog: Startup program whose global block is inspected.
        rank_id: Rank the program was generated for.

    Returns:
        bool: True when the broadcast variable names match the expectation
        exactly (the comparison is order-sensitive).
    """
    need_check_params = ([] if rank_id in [0, 1] else
                         ["linear_1.b_0", "linear_2.b_0"])

    broadcast_varnames = [
        op.output_arg_names[0]
        for op in dist_startup_prog.global_block().ops
        if op.type == "c_broadcast"
    ]

    return need_check_params == broadcast_varnames
def check_allgather(dist_main_program):
    """Check that the resharded ``x`` exists and feeds the matmul.

    Two conditions must hold in the global block of ``dist_main_program``:
    a variable named ``x@RESHARD_0`` exists with shape ``(4, 4)``, and at
    least one ``matmul_v2`` op lists that variable among its inputs.

    Args:
        dist_main_program: Program produced by ``reshard``.

    Returns:
        bool: True only when both conditions hold.
    """
    allgather_out = "x@RESHARD_0"
    block = dist_main_program.global_block()

    # Named ``block_vars`` so the ``vars`` builtin is not shadowed.
    block_vars = block.vars
    var_result = bool(allgather_out in block_vars and
                      block_vars[allgather_out].shape == (4, 4))

    op_result = any(
        op.type == "matmul_v2" and allgather_out in op.input_arg_names
        for op in block.ops)

    return var_result and op_result
class TestMLPReshard(unittest.TestCase):
    """Reshard tests for the mp + pp MLP and for a resharded matmul input."""

    def test_mlp_mppp(self):
        """Partition the MLP for rank 2, reshard it and check the result."""
        serial_main = paddle.static.Program()
        serial_startup = paddle.static.Program()
        context = DistributedContext()
        rank = 2

        dist_main_prog, dist_startup_prog = get_dist_prog(
            serial_main, serial_startup, context, rank)
        complete_backward_annotation(dist_main_prog, context)
        reshard(dist_main_prog, dist_startup_prog, rank, context)

        # The expected send/recv pair must have been inserted.
        has_send_recv = check_send_recv_result(dist_main_prog, rank)
        self.assertTrue(has_send_recv)

        # Parameters that are not sliced should be the same in the mp scene.
        init_ok = check_initialization_for_mppp(dist_startup_prog, rank)
        self.assertTrue(init_ok)

    def test_allgather(self):
        """Reshard a matmul whose input ``x`` is annotated differently by the op.

        ``x`` is annotated with dim_mapping ``[0, -1]`` as a tensor, while the
        sharded matmul op annotates it with ``[-1, -1]``.
        """
        serial_main = paddle.static.Program()
        serial_startup = paddle.static.Program()
        mesh = auto.ProcessMesh(mesh=[0, 3], parent=ROOT_MESH)

        with static.program_guard(serial_main, serial_startup):
            x = paddle.static.data(name="x", shape=[4, 4], dtype='float32')
            x = auto.shard_tensor(x, mesh, dim_mapping=[0, -1])

            w = paddle.static.data(name="w", shape=[4, 4], dtype='float32')
            w = auto.shard_tensor(w, mesh, dim_mapping=[-1, -1])

            op_dim_mappings = {x.name: [-1, -1], w.name: [-1, -1]}
            # Only the construction of the op matters; its output is unused.
            _ = paddle.distributed.shard_op(
                paddle.matmul, mesh, op_dim_mappings, x=x, y=w)[0]

        rank = 0
        context = DistributedContext()
        strategy = fleet.DistributedStrategy()
        partitioner = Partitioner(strategy, context, rank)

        completed_main = auto.complete_annotation(serial_main, context)
        partitioned_main, _ = partitioner.transpile_forward(completed_main,
                                                            serial_startup)

        # NOTE(review): the serial startup program is passed here, not the
        # partitioned one returned by transpile_forward - confirm intentional.
        reshard(partitioned_main, serial_startup, rank, context)

        # x should not be sliced: the matmul must read the (4, 4) resharded x.
        self.assertTrue(check_allgather(partitioned_main))
# Run this module's test cases when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import os
# Default CUDA_VISIBLE_DEVICES to '0' when the caller has not set it; an
# existing value is left untouched. This runs ahead of the paddle import
# below - presumably so Paddle sees the setting when it initializes (confirm).
os.environ.setdefault("CUDA_VISIBLE_DEVICES", '0')
import paddle
import paddle.nn as nn
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
import paddle.distributed.auto_parallel as auto
from paddle.distributed.auto_parallel.context import get_default_distributed_context
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.reshard import reshard
from paddle.distributed.auto_parallel.process import new_process_group
# The programs in this module are built with paddle.static, so switch Paddle
# to static-graph mode up front.
paddle.enable_static()
# Strategy selector read by MLPLayer.forward and mlp_forward ("pp", "dp" or
# anything else for the fallback branch); each test assigns it via ``global``.
_global_parallel_strategy = None
# Process mesh used by the non-"pp" branches; each test assigns it via
# ``global`` before building the program.
_global_process_mesh = None
# Root mesh containing only rank 0; parent of the mesh created in the test.
ROOT_MESH = auto.ProcessMesh([0])
class MLPLayer(nn.Layer):
    """LayerNorm -> linear0 -> GELU -> linear1, with annotated weights.

    Which mesh the two linear weights are annotated on depends on the
    module-level ``_global_parallel_strategy``: ``PP_MESH_0`` / ``PP_MESH_1``
    for ``"pp"``, otherwise ``_global_process_mesh`` for both.

    Args:
        hidden_size: Input width of ``linear0``, output width of ``linear1``
            and the size passed to ``nn.LayerNorm``.
        intermediate_size: Output width of ``linear0`` and input width of
            ``linear1``.
        initializer_range: Standard deviation of the zero-mean normal
            initializer used for the linear weights.
    """

    def __init__(self,
                 hidden_size=1024,
                 intermediate_size=4 * 1024,
                 initializer_range=0.02):
        super(MLPLayer, self).__init__()

        # A single ParamAttr object is shared by both linear layers.
        shared_weight_attr = paddle.ParamAttr(
            initializer=nn.initializer.Normal(
                mean=0.0, std=initializer_range))

        self.linear0 = nn.Linear(
            hidden_size, intermediate_size, shared_weight_attr, bias_attr=None)
        self.linear1 = nn.Linear(
            intermediate_size, hidden_size, shared_weight_attr, bias_attr=None)
        self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5)

    def forward(self, input):
        """Annotate both linear weights, then run the MLP on ``input``."""
        if _global_parallel_strategy == "pp":
            # NOTE(review): PP_MESH_0 / PP_MESH_1 are not defined next to this
            # module's other constants; this branch would raise NameError
            # unless they are provided elsewhere - confirm.
            mesh0, mesh1 = PP_MESH_0, PP_MESH_1
        else:
            mesh0 = mesh1 = _global_process_mesh

        auto.shard_tensor(self.linear0.weight, mesh0, dim_mapping=[-1, -1])
        auto.shard_tensor(self.linear1.weight, mesh1, dim_mapping=[-1, -1])

        hidden = self.linear0(self.norm(input))
        activated = F.gelu(hidden, approximate=True)
        return self.linear1(activated)
def mlp_forward(train_program, start_program):
    """Build the MLP forward pass and its loss inside the given programs.

    Declares a float32 ``input`` of shape ``[4, 1024]`` and a float32
    ``label`` of shape ``[4, 1]``, annotates them according to the
    module-level ``_global_parallel_strategy``, runs ``MLPLayer`` and reduces
    the squared error to its mean.

    Args:
        train_program: Program that receives the forward ops.
        start_program: Program that receives the initialization ops.

    Returns:
        Tuple ``(loss, train_program, start_program)``.
    """
    # unique_name.guard() is presumably used so that generated variable names
    # (e.g. the "gelu_0.tmp_0" the checks look for) are reproducible - confirm.
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024

        # The local is called ``features`` so the ``input`` builtin is not
        # shadowed; the variable name inside the program is still "input".
        features = static.data(
            name="input", shape=[batch_size, hidden_size], dtype='float32')
        label = static.data(
            name="label", shape=[batch_size, 1], dtype='float32')

        if _global_parallel_strategy == "pp":
            auto.shard_tensor(features, PP_MESH_0, dim_mapping=[-1, -1])
            auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1])
        elif _global_parallel_strategy == "dp":
            auto.shard_tensor(
                features, _global_process_mesh, dim_mapping=[0, -1])
        else:
            auto.shard_tensor(
                features, _global_process_mesh, dim_mapping=[-1, -1])

        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            initializer_range=0.02)

        predict = mlp(features)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)

    return loss, train_program, start_program
def get_dist_prog_with_parallelizer(train_program, startup_program,
                                    dist_context):
    """Build the MLP and let the fleet semi-auto optimizer transform it.

    Args:
        train_program: Empty program that receives the serial forward graph.
        startup_program: Empty program that receives the initialization ops.
        dist_context: Accepted for interface compatibility; not used by this
            helper.

    Returns:
        Tuple ``(distributed_main_program, distributed_startup_program)``
        taken from the result of ``optimizer.minimize``.
    """
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.amp = False
    dist_strategy.pipeline = False
    dist_strategy.recompute = False

    # init parallel optimizer
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(
        learning_rate=0.00001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-08,
        grad_clip=None)
    optimizer = fleet.distributed_optimizer(optimizer)

    # Fake a comm group. The returned handle is not used afterwards; it is
    # bound to a throwaway name only so that it stays alive for as long as it
    # did before.
    _ = new_process_group([3, 4])

    # minimize() returns four values; the last two are the distributed
    # startup and main programs, in that order.
    _, _, distributed_startup_program, distributed_main_program = \
        optimizer.minimize(loss, startup_program)

    return distributed_main_program, distributed_startup_program
def check_send_recv_result(dist_main_prog, rank_id):
    """Return True if the program has both the expected send and recv ops.

    For rank 0 the program must contain a ``send_v2`` op that has
    ``gelu_0.tmp_0`` among its input names and a ``recv_v2`` op whose first
    output name contains ``gelu_0.tmp_0@GRAD``. For every other rank it must
    contain a ``send_v2`` op that has ``gelu_0.tmp_0@GRAD`` among its input
    names and a ``recv_v2`` op whose first output name contains
    ``gelu_0.tmp_0``.

    Args:
        dist_main_prog: Program whose global block ops are inspected.
        rank_id: Rank the program was generated for.

    Returns:
        bool: True only when both the send and the recv op are found.
    """
    ops = dist_main_prog.global_block().ops

    # The sent/received variables swap roles between rank 0 and other ranks.
    if rank_id == 0:
        sent_name, received_name = "gelu_0.tmp_0", "gelu_0.tmp_0@GRAD"
    else:
        sent_name, received_name = "gelu_0.tmp_0@GRAD", "gelu_0.tmp_0"

    # Send: exact membership test on the list of input names.
    send_result = any(
        op.type == "send_v2" and sent_name in op.input_arg_names
        for op in ops)
    # Recv: substring test on the first output name.
    recv_result = any(
        op.type == "recv_v2" and received_name in op.output_arg_names[0]
        for op in ops)

    return send_result and recv_result
class TestMLPReshard(unittest.TestCase):
    """Reshard test for the MLP on a single-rank (serial) process mesh."""

    def test_mlp_serial(self):
        """The transformed program must not contain the send/recv pair."""
        global _global_parallel_strategy, _global_process_mesh
        _global_parallel_strategy = None
        _global_process_mesh = auto.ProcessMesh(mesh=[0], parent=ROOT_MESH)

        serial_main = paddle.static.Program()
        serial_startup = paddle.static.Program()
        context = get_default_distributed_context()
        rank = 0

        dist_main_prog, _ = get_dist_prog_with_parallelizer(
            serial_main, serial_startup, context)

        # send and recv should not exist in serial scene.
        has_send_recv = check_send_recv_result(dist_main_prog, rank)
        self.assertFalse(has_send_recv)
# Run this module's test cases when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册