# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import paddle
from paddle.distributed.fleet import auto
from paddle.fluid import program_guard
from paddle.fluid.backward import append_backward

paddle.enable_static()


def make_program_dp2_axis_None():
    # 2-way data-parallel input; p=2 norm reduced over all elements (no axis).
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
        x.stop_gradient = False
        auto.shard_tensor(
            x, auto.ProcessMesh([0, 1], dim_names=["x"]), ["x", None, None]
        )
        tmp_0 = paddle.norm(x, p=2)
    return main_program, start_program, tmp_0


def make_program_dp2_axis_0():
    # 2-way data-parallel input; p=2 norm along axis 0 (the sharded axis).
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
        x.stop_gradient = False
        auto.shard_tensor(
            x, auto.ProcessMesh([0, 1], dim_names=["x"]), ["x", None, None]
        )
        tmp_0 = paddle.norm(x, p=2, axis=0)
    return main_program, start_program, tmp_0


def make_program_dp2_axis_1():
    # 2-way data-parallel input; p=2 norm along axis 1 (a replicated axis).
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
        x.stop_gradient = False
        auto.shard_tensor(
            x, auto.ProcessMesh([0, 1], dim_names=["x"]), ["x", None, None]
        )
        tmp_0 = paddle.norm(x, p=2, axis=1)
    return main_program, start_program, tmp_0


def make_program_serial():
    # Serial baseline: a single-process mesh with a fully replicated input.
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
        x.stop_gradient = False
        auto.shard_tensor(
            x, auto.ProcessMesh([0], dim_names=["x"]), [None, None, None]
        )
        tmp_0 = paddle.norm(x, p=2)
    return main_program, start_program, tmp_0


def parallelizer(program_func, rank):
    from paddle.distributed.auto_parallel.completion import Completer
    from paddle.distributed.auto_parallel.dist_context import (
        DistributedContext,
    )
    from paddle.distributed.auto_parallel.partitioner import Partitioner

    main_program, start_program, loss = program_func()

    # Complete the distributed attributes of the forward program.
    dist_context = DistributedContext()
    completer = Completer(dist_context)
    completer.complete_forward_annotation(main_program)
    dist_context.block_state.parse_forward_blocks(main_program)

    # Append the backward ops, then complete their distributed attributes.
    with program_guard(main_program, start_program):
        params_grads = append_backward(
            loss, distop_context=dist_context.dist_op_context
        )
    completer.complete_backward_annotation(main_program)
    dist_context.block_state.parse_backward_blocks(main_program)

    # Partition the annotated program for the given rank.
    partitioner = Partitioner(dist_context, rank)
    dist_main_prog, _, _ = partitioner.partition(
        main_program, start_program, []
    )

    return dist_main_prog, dist_context


class TestDistPNorm(unittest.TestCase):
    def prepare(self, func):
        # Partition the program for rank 0 and keep its ops for inspection.
        self.dist_main_prog, self.dist_context = parallelizer(func, 0)
        self.ops = self.dist_main_prog.global_block().ops

    def test_dist_pnorm(self):
        pass


class TestDistPNormDP(TestDistPNorm):
    def test_dist_pnorm(self):
        self.prepare(make_program_dp2_axis_None)
        self.check_program()

    def check_program(self):
        # The partitioned program should gather the sharded input before
        # p_norm and slice the gradient back to the local shard afterwards.
        op_types = []
        for op in self.ops:
            op_types.append(op.type)
            op_dist_attr = self.dist_context.get_op_dist_attr_for_program(op)
            if op.type == "p_norm":
                assert op_dist_attr.impl_type == "p_norm"
            if op.type in ["p_norm", "p_norm_grad"]:
                # p_norm and its grad operate on fully replicated tensors.
                for input_attr in op_dist_attr.inputs_dist_attrs.values():
                    assert set(input_attr.dims_mapping) == {-1}
                for output_attr in op_dist_attr.outputs_dist_attrs.values():
                    assert set(output_attr.dims_mapping) == {-1}
            if op.type == 'c_allgather':
                # allgather turns the dim-0-sharded input into a replica.
                for input_attr in op_dist_attr.inputs_dist_attrs.values():
                    assert input_attr.dims_mapping[0] == 0
                    assert set(input_attr.dims_mapping[1:]) == {-1}
                for output_attr in op_dist_attr.outputs_dist_attrs.values():
                    assert set(output_attr.dims_mapping) == {-1}
            if op.type == 'slice':
                # slice re-shards the replicated gradient along dim 0.
                for input_attr in op_dist_attr.inputs_dist_attrs.values():
                    assert set(input_attr.dims_mapping) == {-1}
                for output_attr in op_dist_attr.outputs_dist_attrs.values():
                    assert output_attr.dims_mapping[0] == 0
                    assert set(output_attr.dims_mapping[1:]) == {-1}
        assert op_types == [
            "c_allgather",
            "p_norm",
            "fill_constant",
            "p_norm_grad",
            "slice",
        ]


class TestDistPNormDP1(TestDistPNormDP):
    def test_dist_pnorm(self):
        self.prepare(make_program_dp2_axis_0)
        self.check_program()


class TestDistPNormSerial(TestDistPNorm):
    def test_dist_pnorm(self):
        self.prepare(make_program_serial)
        for op in self.ops:
            op_dist_attr = self.dist_context.get_op_dist_attr_for_program(op)
            assert op_dist_attr.impl_type == "default"


class TestDistPNormDPAxis1(TestDistPNorm):
    def test_dist_pnorm(self):
        self.prepare(make_program_dp2_axis_1)
        for op in self.ops:
            op_dist_attr = self.dist_context.get_op_dist_attr_for_program(op)
            assert op_dist_attr.impl_type == "default"


if __name__ == "__main__":
    unittest.main()