diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py
index 3c229746573da0f551160cb77892b87481a9ca40..c87316ed8f381547eb5636b7189d3c7395aab52d 100644
--- a/python/paddle/distributed/auto_parallel/operators/__init__.py
+++ b/python/paddle/distributed/auto_parallel/operators/__init__.py
@@ -28,5 +28,6 @@ from . import dist_check_finite_and_unscale
 from . import dist_update_loss_scaling
 from . import dist_split
 from . import dist_fill_constant_batch_size_like
+from . import dist_slice
 from . import dist_fused_feedforward
 from . import dist_fused_attention
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_slice.py b/python/paddle/distributed/auto_parallel/operators/dist_slice.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bc0a471dcf1c29ddc4d2912a8a4f34cc687343c
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/operators/dist_slice.py
@@ -0,0 +1,122 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .common import DistributedOperatorImplContainer
+from .common import DistributedOperatorImpl
+from .common import register_distributed_operator_impl_container
+from .common import register_distributed_operator_impl
+from ..utils import is_dim_shard
+from ..utils import compute_compatible_and_update_dim_mapping
+from .dist_default import DistributedDefaultImpl0
+
+
+class DistributedSlice(DistributedOperatorImplContainer):
+    def __init__(self, op_type):
+        super(DistributedSlice, self).__init__(op_type)
+
+
+register_distributed_operator_impl_container(DistributedSlice("slice"))
+
+
+class DistributedSliceImpl(DistributedOperatorImpl):
+    def __init__(self, name):
+        super(DistributedSliceImpl, self).__init__(name)
+        self._forward_implemented = True
+        self._backward_implemented = True
+
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        in_name = op_desc.input('Input')[0]
+        axes = op_desc.attr('axes')
+        in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name)
+        for axis in axes:
+            if is_dim_shard(in_dims_mapping[axis]):
+                return False
+        return True
+
+    def is_output_compatible(self, dist_op):
+        return True
+
+    def is_compatible(self, dist_op):
+        if (not self.is_input_compatible(dist_op)) or \
+            (not self.is_output_compatible(dist_op)):
+            return False
+
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        in_name = op_desc.input('Input')[0]
+        out_name = op_desc.output('Out')[0]
+        decrease_axis = op_desc.attr('decrease_axis')
+        in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name)
+        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+        if len(in_dims_mapping) - len(decrease_axis) != 0 and len(
+                out_dims_mapping) != len(in_dims_mapping) - len(decrease_axis):
+            return False
+
+        new_out_dims_mapping = []
+        for i in range(len(in_dims_mapping)):
+            if i not in decrease_axis:
+                new_out_dims_mapping.append(in_dims_mapping[i])
+        if new_out_dims_mapping == []:
+            new_out_dims_mapping = [-1]
+        if new_out_dims_mapping != out_dims_mapping:
+            return False
+
+        return True
+
+    def is_auto_compatible(self, dist_op):
+        if (not self.is_input_compatible(dist_op)) or \
+            (not self.is_output_compatible(dist_op)) or \
+            (not self.is_compatible(dist_op)):
+            return False
+
+        return True
+
+    def update_dims_mapping(self, dist_op):
+        changed = False
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        in_name = op_desc.input('Input')[0]
+        out_name = op_desc.output('Out')[0]
+        decrease_axis = op_desc.attr('decrease_axis')
+        in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name)
+        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+
+        ref_dims_mapping = []
+        for i in range(len(in_dims_mapping)):
+            if i not in decrease_axis:
+                ref_dims_mapping.append(in_dims_mapping[i])
+        if ref_dims_mapping == []:
+            ref_dims_mapping = [-1]
+
+        assert len(ref_dims_mapping) == len(out_dims_mapping)
+        for i in range(len(out_dims_mapping)):
+            if out_dims_mapping[i] != ref_dims_mapping[i]:
+                out_dims_mapping[i] = ref_dims_mapping[i]
+                changed = True
+
+        return changed
+
+    @staticmethod
+    def forward(ctx, *args, **kwargs):
+        DistributedDefaultImpl0.forward(ctx, *args, **kwargs)
+
+    @staticmethod
+    def backward(ctx, *args, **kwargs):
+        DistributedDefaultImpl0.backward(ctx, *args, **kwargs)
+
+
+register_distributed_operator_impl("slice",
+                                   DistributedSliceImpl("decrease_in_axis"))
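
[Editor's note] The heart of DistributedSliceImpl above is one rule: the output
dims_mapping is the input dims_mapping with every dim in decrease_axis removed,
falling back to [-1] when nothing survives. A minimal standalone sketch of that
rule (the helper name is invented for illustration, not part of the PR):

    def sliced_dims_mapping(in_dims_mapping, decrease_axis):
        # Keep the mapping of every dim that survives the slice.
        out = [m for i, m in enumerate(in_dims_mapping)
               if i not in decrease_axis]
        # A fully decreased (0-d) output still carries one replicated dim.
        return out if out else [-1]

    # 3-D input sharded on dim 0, integer-indexed on the (unsharded) last dim:
    assert sliced_dims_mapping([0, -1, -1], [2]) == [0, -1]
    # Integer indexing on every dim yields a scalar:
    assert sliced_dims_mapping([-1, -1, -1], [0, 1, 2]) == [-1]
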
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
index 87031fe09e5a8ca613ad9c919a5e5b815559e475..46b1cffa54394aa01501e995c1512773e1197197 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
@@ -18,5 +18,6 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
     py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS})
     py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS})
     py_test_modules(test_new_cost_model MODULES test_new_cost_model ENVS ${dist_ENVS})
+    py_test_modules(test_dist_slice MODULES test_dist_slice ENVS ${dist_ENVS})
     py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS})
 endif()
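
[Editor's note] For readers tracing the tests below back to the operator: in
static graph mode, indexing a Variable lowers to a slice op whose axes and
decrease_axis attributes drive DistributedSliceImpl. A hedged illustration of
the attributes I expect each indexing form to produce (my reading of the
slicing semantics, not values stated in the PR):

    x[0]           # axes=[0], starts=[0], ends=[1], decrease_axis=[0]
    x[:, 0, :]     # axes=[1], starts=[0], ends=[1], decrease_axis=[1]
    x[:2, :2, :2]  # axes=[0, 1, 2], starts=[0, 0, 0], ends=[2, 2, 2],
                   # decrease_axis=[] since no integer index, so rank is kept
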
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf4621dbb0ce840515475b0e82e10605a7e3f06
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.distributed.auto_parallel as auto
+
+paddle.enable_static()
+
+
+def make_program_dp2():
+    main_program = paddle.fluid.Program()
+    start_program = paddle.fluid.Program()
+    with paddle.static.program_guard(main_program, start_program):
+        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
+        auto.shard_tensor(
+            x,
+            dist_attr={
+                "process_mesh": auto.ProcessMesh([0, 1]),
+                "dims_mapping": [0, -1, -1]
+            })
+        tmp_0 = x[0]
+        tmp_1 = x[:, 0, :]
+        tmp_2 = x[:, :, 1]
+        tmp_3 = x[:2, :2, :2]
+    return main_program, start_program
+
+
+def make_program_serial():
+    main_program = paddle.fluid.Program()
+    start_program = paddle.fluid.Program()
+    with paddle.static.program_guard(main_program, start_program):
+        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
+        auto.shard_tensor(
+            x,
+            dist_attr={
+                "process_mesh": auto.ProcessMesh([0]),
+                "dims_mapping": [-1, -1, -1]
+            })
+        tmp_0 = x[0]
+        tmp_1 = x[:, 0, :]
+        tmp_2 = x[:, :, 1]
+        tmp_3 = x[2, 2, :]
+        tmp_4 = x[:2, :2, :2]
+        tmp_5 = x[0, 0, 0]
+    return main_program, start_program
+
+
+def parallelizer(program_func, rank):
+    from paddle.distributed.auto_parallel.completion import Completer
+    from paddle.distributed.auto_parallel.partitioner import Partitioner
+    from paddle.distributed.auto_parallel.dist_context import DistributedContext
+
+    main_program, start_program = program_func()
+
+    dist_context = DistributedContext()
+    completer = Completer(dist_context)
+    completer.complete_forward_annotation(main_program)
+
+    dist_context.block_state.parse_forward_blocks(main_program)
+    partitioner = Partitioner(dist_context, rank)
+    dist_main_prog, _, _ = partitioner.partition(main_program, start_program,
+                                                 [])
+
+    return dist_main_prog, dist_context
+
+
+class TestDistSlice(unittest.TestCase):
+    def test_dist_slice_dp2(self):
+
+        for rank in range(2):
+            dist_main_prog, dist_context = parallelizer(make_program_dp2, rank)
+            ops = dist_main_prog.global_block().ops
+            for op in ops:
+                axes = op.desc.attr('axes')
+                op_dist_attr = dist_context.get_op_dist_attr_for_program(op)
+                if axes[0] == 0:
+                    assert op_dist_attr.impl_type == "default"
+                else:
+                    assert op_dist_attr.impl_type == "slice"
+                    for out in op.output_arg_names:
+                        var_dims_mapping = op_dist_attr.get_output_dims_mapping(
+                            out)
+                        assert var_dims_mapping[0] == 0
+
+    def test_dist_slice_serial(self):
+        dist_main_prog, dist_context = parallelizer(make_program_serial, 0)
+        ops = dist_main_prog.global_block().ops
+        for op in ops:
+            op_dist_attr = dist_context.get_op_dist_attr_for_program(op)
+            assert op_dist_attr.impl_type == "slice"
+            for out in op.output_arg_names:
+                var_dims_mapping = op_dist_attr.get_output_dims_mapping(out)
+                ref_dims_mapping = [-1 for i in range(len(var_dims_mapping))]
+                assert var_dims_mapping == ref_dims_mapping
+
+
+if __name__ == "__main__":
+    unittest.main()
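
[Editor's note] A hedged walk-through of what test_dist_slice_dp2 expects, with
x sharded [0, -1, -1] across the 2-process mesh (inferred from the code above):

    tmp_0 = x[0]           # axes=[0] touches the sharded dim, so
                           # is_input_compatible rejects DistributedSliceImpl
                           # and the default impl is chosen
    tmp_1 = x[:, 0, :]     # axes=[1] is unsharded -> impl_type == "slice";
                           # dim 1 is decreased, output mapping [0, -1]
    tmp_2 = x[:, :, 1]     # axes=[2] is unsharded -> impl_type == "slice";
                           # output mapping [0, -1]
    tmp_3 = x[:2, :2, :2]  # axes=[0, 1, 2] includes the sharded dim 0 ->
                           # default impl again

In test_dist_slice_serial every slice runs on a single-process mesh, so each
output dims_mapping should be fully replicated ([-1, ...]), which is exactly
what the assertion checks.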