diff --git a/python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_group_sharded_stage3.py b/python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_group_sharded_stage3.py index 72f89eb25f43f27c38b52fe20c975a7ad1b8d2b5..d08f1727f3ef7bd3064e93fc6637887afabc6c39 100644 --- a/python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_group_sharded_stage3.py +++ b/python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_group_sharded_stage3.py @@ -34,7 +34,6 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import ( GroupShardedScaler, ) -from paddle.fluid.framework import _test_eager_guard from paddle.nn import Linear epoch = 10 @@ -331,5 +330,4 @@ def test_stage2_stage3(): if __name__ == '__main__': - with _test_eager_guard(): - test_stage2_stage3() + test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_sharding_stage2.py index b088bf424b85c5d6baf24c363bc3e8462e6f5f5a..651b0adf9e28429c48db367c27ee619b62b2d2fd 100644 --- a/python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_sharding_stage2.py @@ -29,7 +29,6 @@ from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimiz from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ( ShardingStage2, ) -from paddle.fluid.framework import _test_eager_guard from paddle.nn import Linear seed = 2022 @@ -248,7 +247,5 @@ def test_dp_stage2(): if __name__ == '__main__': - with _test_eager_guard(): - pass fleet.init(is_collective=True, strategy=strategy) test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py b/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py index ad2c91381adf80edc166b781440346d85913975b..c54101baec69212ec707eeb59b11ad12752aa7aa 100644 --- a/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py +++ b/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py @@ -20,7 +20,6 @@ import numpy as np import paddle from paddle.fluid import core from paddle.fluid.dygraph.parallel import ParallelEnv -from paddle.fluid.framework import _test_eager_guard class TestProcessGroupFp32(unittest.TestCase): @@ -35,154 +34,151 @@ class TestProcessGroupFp32(unittest.TestCase): self.shape = (2, 10, 5) def test_create_process_group_gloo(self): - with _test_eager_guard(): - nranks = ParallelEnv().nranks - rank = ParallelEnv().local_rank - is_master = True if rank == 0 else False - store = paddle.fluid.core.TCPStore( - "127.0.0.1", 6272, is_master, nranks, 30 - ) - pg = paddle.fluid.core.ProcessGroupGloo.create(store, rank, nranks) - - # test allreduce sum - # rank 0 - paddle.device.set_device('cpu') - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - sum_result = x + y - if rank == 0: - task = pg.allreduce(tensor_x) - task.wait() - np.testing.assert_equal(tensor_x, sum_result) - else: - task = pg.allreduce(tensor_y) - task.wait() - np.testing.assert_equal(tensor_y, sum_result) - - print("test allreduce sum api ok") - - # test allreduce max - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - max_result = paddle.maximum(tensor_x, tensor_y) - - if rank == 0: - task = pg.allreduce(tensor_x, core.ReduceOp.MAX) - task.wait() - assert np.array_equal(tensor_x, max_result) - else: - task = pg.allreduce(tensor_y, core.ReduceOp.MAX) - task.wait() - assert np.array_equal(tensor_y, max_result) - - print("test allreduce max api ok") - - # test broadcast - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - broadcast_result = paddle.assign(tensor_x) - if rank == 0: - task = pg.broadcast(tensor_x, 0) - assert np.array_equal(broadcast_result, tensor_x) - else: - task = pg.broadcast(tensor_y, 0) - assert np.array_equal(broadcast_result, tensor_y) - print("test broadcast api ok") - - # test barrier - # rank 0 - if pg.rank() == 0: - task = pg.barrier() - task.wait() - # rank 1 - else: - task = pg.barrier() - task.wait() - - print("test barrier api ok\n") - - # test allgather - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - y = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - tensor_y = paddle.to_tensor(y) - out_shape = list(self.shape) - out_shape[0] *= 2 - out = np.random.random(out_shape).astype(self.dtype) - tensor_out = paddle.to_tensor(out) - if pg.rank() == 0: - task = pg.all_gather(tensor_x, tensor_out) - task.wait() - paddle.device.cuda.synchronize() - # rank 1 - else: - task = pg.all_gather(tensor_y, tensor_out) - task.wait() - out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) - out_2 = paddle.slice( - tensor_out, [0], [out_shape[0] // 2], [out_shape[0]] - ) - assert np.array_equal(tensor_x, out_1) - assert np.array_equal(tensor_y, out_2) - print("test allgather api ok\n") - - # test Reduce - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - y = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - tensor_y = paddle.to_tensor(y) - sum_result = tensor_x + tensor_y - if pg.rank() == 0: - task = pg.reduce(tensor_x, 0) - task.wait() - # rank 1 - else: - task = pg.reduce(tensor_y, 0) - task.wait() - if pg.rank() == 0: - assert np.array_equal(tensor_x, sum_result) - print("test reduce sum api ok\n") - - # test Scatter - # rank 0 - in_shape = list(self.shape) - in_shape[0] *= 2 - x = np.random.random(in_shape).astype(self.dtype) - y = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - tensor_y = paddle.to_tensor(y) - if pg.rank() == 0: - task = pg.scatter(tensor_x, tensor_y, 0) - task.wait() - # rank 1 - else: - task = pg.scatter(tensor_x, tensor_y, 0) - task.wait() - out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]]) - out2 = paddle.slice( - tensor_x, [0], [self.shape[0]], [self.shape[0] * 2] - ) - if pg.rank() == 0: - assert np.array_equal(tensor_y, out1) - else: - assert np.array_equal(tensor_y, out2) - print("test scatter api ok\n") + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore( + "127.0.0.1", 6272, is_master, nranks, 30 + ) + pg = paddle.fluid.core.ProcessGroupGloo.create(store, rank, nranks) + + # test allreduce sum + # rank 0 + paddle.device.set_device('cpu') + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + sum_result = x + y + if rank == 0: + task = pg.allreduce(tensor_x) + task.wait() + np.testing.assert_equal(tensor_x, sum_result) + else: + task = pg.allreduce(tensor_y) + task.wait() + np.testing.assert_equal(tensor_y, sum_result) + + print("test allreduce sum api ok") + + # test allreduce max + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + max_result = paddle.maximum(tensor_x, tensor_y) + + if rank == 0: + task = pg.allreduce(tensor_x, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_x, max_result) + else: + task = pg.allreduce(tensor_y, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_y, max_result) + + print("test allreduce max api ok") + + # test broadcast + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + broadcast_result = paddle.assign(tensor_x) + if rank == 0: + task = pg.broadcast(tensor_x, 0) + assert np.array_equal(broadcast_result, tensor_x) + else: + task = pg.broadcast(tensor_y, 0) + assert np.array_equal(broadcast_result, tensor_y) + print("test broadcast api ok") + + # test barrier + # rank 0 + if pg.rank() == 0: + task = pg.barrier() + task.wait() + # rank 1 + else: + task = pg.barrier() + task.wait() + + print("test barrier api ok\n") + + # test allgather + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + out_shape = list(self.shape) + out_shape[0] *= 2 + out = np.random.random(out_shape).astype(self.dtype) + tensor_out = paddle.to_tensor(out) + if pg.rank() == 0: + task = pg.all_gather(tensor_x, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.all_gather(tensor_y, tensor_out) + task.wait() + out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) + out_2 = paddle.slice( + tensor_out, [0], [out_shape[0] // 2], [out_shape[0]] + ) + assert np.array_equal(tensor_x, out_1) + assert np.array_equal(tensor_y, out_2) + print("test allgather api ok\n") + + # test Reduce + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = pg.reduce(tensor_x, 0) + task.wait() + # rank 1 + else: + task = pg.reduce(tensor_y, 0) + task.wait() + if pg.rank() == 0: + assert np.array_equal(tensor_x, sum_result) + print("test reduce sum api ok\n") + + # test Scatter + # rank 0 + in_shape = list(self.shape) + in_shape[0] *= 2 + x = np.random.random(in_shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + if pg.rank() == 0: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + # rank 1 + else: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]]) + out2 = paddle.slice(tensor_x, [0], [self.shape[0]], [self.shape[0] * 2]) + if pg.rank() == 0: + assert np.array_equal(tensor_y, out1) + else: + assert np.array_equal(tensor_y, out2) + print("test scatter api ok\n") if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_mpi.py b/python/paddle/fluid/tests/unittests/collective/process_group_mpi.py index 2fb9deb110e2a520bdb86756e38b6be0d12c6579..f2fc9c498b4e82b2511f6c607f21bc5ec4f4f728 100644 --- a/python/paddle/fluid/tests/unittests/collective/process_group_mpi.py +++ b/python/paddle/fluid/tests/unittests/collective/process_group_mpi.py @@ -28,7 +28,7 @@ from paddle.distributed.collective import ( _set_group_map_by_name, ) from paddle.fluid import core -from paddle.fluid.framework import _set_expected_place, _test_eager_guard +from paddle.fluid.framework import _set_expected_place ctypes.CDLL("libmpi.so", mode=ctypes.RTLD_GLOBAL) @@ -444,51 +444,49 @@ class TestProcessGroup(unittest.TestCase): self.shape = (2, 10, 5) def test_create_process_group_mpi(self): - with _test_eager_guard(): - group = init_process_group() - pg = group.process_group + group = init_process_group() + pg = group.process_group + # test allreduce sum + test_allreduce_sum(pg, self.shape, self.dtype) - # test allreduce sum - test_allreduce_sum(pg, self.shape, self.dtype) + # test allreduce max + test_allreduce_max(pg, self.shape, self.dtype) - # test allreduce max - test_allreduce_max(pg, self.shape, self.dtype) + # test allreduce min + test_allreduce_min(pg, self.shape, self.dtype) - # test allreduce min - test_allreduce_min(pg, self.shape, self.dtype) + # test allreduce prod + test_allreduce_prod(pg, self.shape, self.dtype) - # test allreduce prod - test_allreduce_prod(pg, self.shape, self.dtype) + # test broadcast + test_broadcast(pg, self.shape, self.dtype) - # test broadcast - test_broadcast(pg, self.shape, self.dtype) + # test barrier + test_barrair(pg) - # test barrier - test_barrair(pg) + # test allgather + test_allgather(pg, self.shape, self.dtype) - # test allgather - test_allgather(pg, self.shape, self.dtype) + # test alltoall + test_all2all(pg, self.shape, self.dtype) - # test alltoall - test_all2all(pg, self.shape, self.dtype) + # test Reduce + test_reduce_sum(pg, self.shape, self.dtype) - # test Reduce - test_reduce_sum(pg, self.shape, self.dtype) + # test reduce max + test_reduce_max(pg, self.shape, self.dtype) - # test reduce max - test_reduce_max(pg, self.shape, self.dtype) + # test reduce min + test_reduce_min(pg, self.shape, self.dtype) - # test reduce min - test_reduce_min(pg, self.shape, self.dtype) + # test reduce product + test_reduce_prod(pg, self.shape, self.dtype) - # test reduce product - test_reduce_prod(pg, self.shape, self.dtype) + # test Scatter + test_scatter(pg, self.shape, self.dtype) - # test Scatter - test_scatter(pg, self.shape, self.dtype) - - # test send recv. - test_send_recv(pg, group, self.shape, self.dtype) + # test send recv. + test_send_recv(pg, group, self.shape, self.dtype) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py index 2e585e1cc6d716b1c9bc987e4a7ea5d4d969f7a2..ba4ec91a80403957031e526a7103458801e0d9ad 100644 --- a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py @@ -20,7 +20,6 @@ import numpy as np import paddle import paddle.distributed as dist from paddle.fluid.dygraph.parallel import ParallelEnv -from paddle.fluid.framework import _test_eager_guard def init_process_group(strategy=None): @@ -44,459 +43,428 @@ class TestProcessGroupFp32(unittest.TestCase): self.shape = (2, 10, 5) def test_create_process_group_nccl(self): - with _test_eager_guard(): - device_id = paddle.distributed.ParallelEnv().dev_id - paddle.set_device('gpu:%d' % device_id) - - pg = init_process_group() - print("rank:", pg.rank(), "size:", pg.size(), "name:", pg.name()) - print("test new group api ok") - - # test allreduce sum - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - sum_result = tensor_x + tensor_y - if pg.rank() == 0: - task = dist.all_reduce(tensor_x) - assert np.array_equal(tensor_x, sum_result) - else: - task = dist.all_reduce(tensor_y) - assert np.array_equal(tensor_y, sum_result) - - print("test allreduce sum api ok") - - # test allreduce max - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - max_result = paddle.maximum(tensor_x, tensor_y) - - if pg.rank() == 0: - task = dist.all_reduce( - tensor_x, dist.ReduceOp.MAX, sync_op=False - ) - task.wait() - assert np.array_equal(tensor_x, max_result) - else: - task = dist.all_reduce( - tensor_y, dist.ReduceOp.MAX, sync_op=False - ) - task.wait() - assert np.array_equal(tensor_y, max_result) - - print("test allreduce max api ok") - - # test allreduce min - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - min_result = paddle.minimum(tensor_x, tensor_y) - - if pg.rank() == 0: - task = dist.all_reduce( - tensor_x, dist.ReduceOp.MIN, sync_op=False - ) - task.wait() - assert np.array_equal(tensor_x, min_result) - else: - task = dist.all_reduce( - tensor_y, dist.ReduceOp.MIN, sync_op=False - ) - task.wait() - assert np.array_equal(tensor_y, min_result) - - print("test allreduce min api ok") - - # test allreduce prod - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - prod_result = np.multiply(x, y) - - if pg.rank() == 0: - task = dist.all_reduce( - tensor_x, dist.ReduceOp.PROD, sync_op=False - ) - task.wait() - assert np.array_equal(tensor_x, prod_result) - else: - task = dist.all_reduce( - tensor_y, dist.ReduceOp.PROD, sync_op=False - ) - task.wait() - assert np.array_equal(tensor_y, prod_result) - - print("test allreduce prod api ok") - - # test broadcast - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - broadcast_result = paddle.assign(tensor_x) - if pg.rank() == 0: - task = dist.broadcast(tensor_x, 0, sync_op=False) - task.synchronize() - paddle.device.cuda.synchronize() - assert task.is_completed() - assert np.array_equal(broadcast_result, tensor_x) - else: - task = dist.broadcast(tensor_y, 0) - paddle.device.cuda.synchronize() - assert np.array_equal(broadcast_result, tensor_y) - - print("test broadcast api ok") - - # test broadcast with shape=[] - # rank 0 - x = np.random.random([]).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random([]).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - broadcast_result = paddle.assign(tensor_x) - if pg.rank() == 0: - task = dist.broadcast(tensor_x, 0, sync_op=False) - task.synchronize() - paddle.device.cuda.synchronize() - assert task.is_completed() - assert np.array_equal(broadcast_result, tensor_x) - else: - task = dist.broadcast(tensor_y, 0) - paddle.device.cuda.synchronize() - assert np.array_equal(broadcast_result, tensor_y) - assert tensor_y.shape == [] - - print("test broadcast api with shape=[] ok") - - # test barrier - # rank 0 - if pg.rank() == 0: - pg.barrier(device_id) - # rank 1 - else: - task = pg.barrier(device_id) - task.wait() - - print("test barrier api ok\n") - - # test allgather - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - y = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - tensor_y = paddle.to_tensor(y) - out_shape = list(self.shape) - out_shape[0] *= 2 - out = np.random.random(out_shape).astype(self.dtype) - tensor_out = paddle.to_tensor(out) - if pg.rank() == 0: - task = pg.all_gather(tensor_x, tensor_out) - task.wait() - paddle.device.cuda.synchronize() - # rank 1 - else: - tensor_out_list = [ - paddle.empty_like(tensor_x), - paddle.empty_like(tensor_x), - ] - task = dist.all_gather(tensor_out_list, tensor_y, sync_op=False) - paddle.device.cuda.synchronize() - tensor_out = paddle.concat(tensor_out_list) - out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) - out_2 = paddle.slice( - tensor_out, [0], [out_shape[0] // 2], [out_shape[0]] - ) - assert np.array_equal(tensor_x, out_1) - assert np.array_equal(tensor_y, out_2) - print("test allgather api ok\n") - - if pg.rank() == 0: - task = pg.all_gather(tensor_x, tensor_out) - task.wait() - paddle.device.cuda.synchronize() - # rank 1 - else: - tensor_out_list = [] - task = dist.all_gather(tensor_out_list, tensor_y, sync_op=False) - paddle.device.cuda.synchronize() - tensor_out = paddle.concat(tensor_out_list) - out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) - out_2 = paddle.slice( - tensor_out, [0], [out_shape[0] // 2], [out_shape[0]] - ) - assert np.array_equal(tensor_x, out_1) - assert np.array_equal(tensor_y, out_2) - print("test allgather api2 ok\n") - - # test alltoall - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - y = np.random.random(self.shape).astype(self.dtype) - out1 = np.random.random(self.shape).astype(self.dtype) - out2 = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - tensor_y = paddle.to_tensor(y) - tensor_out1 = paddle.to_tensor(out1) - tensor_out2 = paddle.to_tensor(out2) - raw_tensor_x_2 = paddle.slice( - tensor_x, [0], [self.shape[0] // 2], [self.shape[0]] - ) - raw_tensor_y_1 = paddle.slice( - tensor_y, [0], [0], [self.shape[0] // 2] - ) - if pg.rank() == 0: - task = pg.alltoall(tensor_x, tensor_out1) - task.wait() - # rank 1 - else: - in_1, in_2 = paddle.split(tensor_y, 2) - out_1, out_2 = paddle.split(tensor_out2, 2) - out_tensor_list = [out_1, out_2] - task = dist.alltoall([in_1, in_2], out_tensor_list) - paddle.device.cuda.synchronize() - tensor_out2 = paddle.concat(out_tensor_list) - out1_2 = paddle.slice( - tensor_out1, [0], [self.shape[0] // 2], [self.shape[0]] - ) - out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2]) - if pg.rank() == 0: - assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy()) - else: - assert np.array_equal(out2_1, raw_tensor_x_2) - print("test alltoall api ok\n") - - x = np.random.random(self.shape).astype(self.dtype) - y = np.random.random(self.shape).astype(self.dtype) - out1 = np.random.random(self.shape).astype(self.dtype) - out2 = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - tensor_y = paddle.to_tensor(y) - tensor_out1 = paddle.to_tensor(out1) - tensor_out2 = paddle.to_tensor(out2) - raw_tensor_x_2 = paddle.slice( - tensor_x, [0], [self.shape[0] // 2], [self.shape[0]] - ) - raw_tensor_y_1 = paddle.slice( - tensor_y, [0], [0], [self.shape[0] // 2] - ) - if pg.rank() == 0: - task = pg.alltoall(tensor_x, tensor_out1) - task.wait() - # rank 1 - else: - in_1, in_2 = paddle.split(tensor_y, 2) - out_1, out_2 = paddle.split(tensor_out2, 2) - out_tensor_list = [] - task = dist.alltoall([in_1, in_2], out_tensor_list) - paddle.device.cuda.synchronize() - tensor_out2 = paddle.concat(out_tensor_list) - out1_2 = paddle.slice( - tensor_out1, [0], [self.shape[0] // 2], [self.shape[0]] - ) - out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2]) - if pg.rank() == 0: - assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy()) - else: - assert np.array_equal(out2_1, raw_tensor_x_2) - print("test alltoall api2 ok\n") - - # test Reduce - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - y = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - tensor_y = paddle.to_tensor(y) - sum_result = tensor_x + tensor_y - if pg.rank() == 0: - task = dist.reduce(tensor_x, 0, sync_op=True) - paddle.device.cuda.synchronize() - # rank 1 - else: - task = dist.reduce(tensor_y, 0, sync_op=False) - task.wait() - paddle.device.cuda.synchronize() - if pg.rank() == 0: - assert np.array_equal(tensor_x, sum_result) - print("test reduce sum api ok\n") - - # test reduce max - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - max_result = paddle.maximum(tensor_x, tensor_y) - - if pg.rank() == 0: - task = dist.reduce( - tensor_x, 0, dist.ReduceOp.MAX, sync_op=False - ) - task.wait() - assert np.array_equal(tensor_x, max_result) - else: - task = dist.reduce( - tensor_y, 0, dist.ReduceOp.MAX, sync_op=False - ) - task.wait() - - print("test reduce max api ok") - - # test reduce min - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - min_result = paddle.minimum(tensor_x, tensor_y) - - if pg.rank() == 0: - task = dist.reduce( - tensor_x, 0, dist.ReduceOp.MIN, sync_op=False - ) - task.wait() - assert np.array_equal(tensor_x, min_result) - else: - task = dist.reduce( - tensor_y, 0, dist.ReduceOp.MIN, sync_op=False - ) - task.wait() - - print("test reduce min api ok") - - # test reduce product - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - prod_result = np.multiply(x, y) - - if pg.rank() == 0: - task = dist.reduce( - tensor_x, 0, dist.ReduceOp.PROD, sync_op=False - ) - task.wait() - assert np.array_equal(tensor_x, prod_result) - else: - task = dist.reduce( - tensor_y, 0, dist.ReduceOp.PROD, sync_op=False - ) - task.wait() - - print("test reduce prod api ok") - # test Scatter - # rank 0 - in_shape = list(self.shape) - in_shape[0] *= 2 - x = np.random.random(in_shape).astype(self.dtype) - y = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - tensor_y = paddle.to_tensor(y) - if pg.rank() == 0: - in_1, in_2 = paddle.split(tensor_x, 2) - task = dist.scatter(tensor_y, [in_1, in_2], 0, sync_op=True) - # task.wait() - paddle.device.cuda.synchronize() - # rank 1 - else: - task = dist.scatter(tensor_y, [], 0, sync_op=False) - task.wait() - paddle.device.cuda.synchronize() - out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]]) - out2 = paddle.slice( - tensor_x, [0], [self.shape[0]], [self.shape[0] * 2] - ) - if pg.rank() == 0: - assert np.array_equal(tensor_y, out1) - else: - assert np.array_equal(tensor_y, out2) - print("test scatter api ok\n") - - # test Scatter with shape=[] - # rank 0 - x = np.random.random([]).astype(self.dtype) - y = np.random.random([]).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - tensor_y = paddle.to_tensor(y) - if pg.rank() == 0: - in_1, in_2 = tensor_x, tensor_x + 1 - task = dist.scatter(tensor_y, [in_1, in_2], 0, sync_op=True) - paddle.device.cuda.synchronize() - # rank 1 - else: - task = dist.scatter(tensor_y, [], 0, sync_op=True) - task.wait() - paddle.device.cuda.synchronize() - out1 = paddle.assign(tensor_x) - out2 = paddle.assign(tensor_x + 1) - if pg.rank() == 0: - assert np.array_equal(tensor_y, out1) - else: - assert np.array_equal(tensor_y, out2), f"{tensor_y}, {out2}" - assert tensor_y.shape == [] - print("test scatter api with shape=[] ok\n") - - # test send min - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - if pg.rank() == 0: - task = dist.send(tensor_x, 1, sync_op=False) - task.wait() - else: - task = dist.recv(tensor_y, 0, sync_op=False) - task.wait() - assert np.array_equal(tensor_y, tensor_x) - - print("test send api ok") - - # test send min - # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) - # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - - if pg.rank() == 0: - task = dist.send(tensor_x, 1, sync_op=True) - else: - task = dist.recv(tensor_y, 0, sync_op=True) - assert np.array_equal(tensor_y, tensor_x) - - print("test send api ok") + device_id = paddle.distributed.ParallelEnv().dev_id + paddle.set_device('gpu:%d' % device_id) + + pg = init_process_group() + print("rank:", pg.rank(), "size:", pg.size(), "name:", pg.name()) + print("test new group api ok") + + # test allreduce sum + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = dist.all_reduce(tensor_x) + assert np.array_equal(tensor_x, sum_result) + else: + task = dist.all_reduce(tensor_y) + assert np.array_equal(tensor_y, sum_result) + + print("test allreduce sum api ok") + + # test allreduce max + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + max_result = paddle.maximum(tensor_x, tensor_y) + + if pg.rank() == 0: + task = dist.all_reduce(tensor_x, dist.ReduceOp.MAX, sync_op=False) + task.wait() + assert np.array_equal(tensor_x, max_result) + else: + task = dist.all_reduce(tensor_y, dist.ReduceOp.MAX, sync_op=False) + task.wait() + assert np.array_equal(tensor_y, max_result) + + print("test allreduce max api ok") + + # test allreduce min + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + min_result = paddle.minimum(tensor_x, tensor_y) + + if pg.rank() == 0: + task = dist.all_reduce(tensor_x, dist.ReduceOp.MIN, sync_op=False) + task.wait() + assert np.array_equal(tensor_x, min_result) + else: + task = dist.all_reduce(tensor_y, dist.ReduceOp.MIN, sync_op=False) + task.wait() + assert np.array_equal(tensor_y, min_result) + + print("test allreduce min api ok") + + # test allreduce prod + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + prod_result = np.multiply(x, y) + + if pg.rank() == 0: + task = dist.all_reduce(tensor_x, dist.ReduceOp.PROD, sync_op=False) + task.wait() + assert np.array_equal(tensor_x, prod_result) + else: + task = dist.all_reduce(tensor_y, dist.ReduceOp.PROD, sync_op=False) + task.wait() + assert np.array_equal(tensor_y, prod_result) + + print("test allreduce prod api ok") + + # test broadcast + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + broadcast_result = paddle.assign(tensor_x) + if pg.rank() == 0: + task = dist.broadcast(tensor_x, 0, sync_op=False) + task.synchronize() + paddle.device.cuda.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_x) + else: + task = dist.broadcast(tensor_y, 0) + paddle.device.cuda.synchronize() + assert np.array_equal(broadcast_result, tensor_y) + + print("test broadcast api ok") + + # test broadcast with shape=[] + # rank 0 + x = np.random.random([]).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random([]).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + broadcast_result = paddle.assign(tensor_x) + if pg.rank() == 0: + task = dist.broadcast(tensor_x, 0, sync_op=False) + task.synchronize() + paddle.device.cuda.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_x) + else: + task = dist.broadcast(tensor_y, 0) + paddle.device.cuda.synchronize() + assert np.array_equal(broadcast_result, tensor_y) + assert tensor_y.shape == [] + + print("test broadcast api with shape=[] ok") + + # test barrier + # rank 0 + if pg.rank() == 0: + pg.barrier(device_id) + # rank 1 + else: + task = pg.barrier(device_id) + task.wait() + + print("test barrier api ok\n") + + # test allgather + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + out_shape = list(self.shape) + out_shape[0] *= 2 + out = np.random.random(out_shape).astype(self.dtype) + tensor_out = paddle.to_tensor(out) + if pg.rank() == 0: + task = pg.all_gather(tensor_x, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + tensor_out_list = [ + paddle.empty_like(tensor_x), + paddle.empty_like(tensor_x), + ] + task = dist.all_gather(tensor_out_list, tensor_y, sync_op=False) + paddle.device.cuda.synchronize() + tensor_out = paddle.concat(tensor_out_list) + out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) + out_2 = paddle.slice( + tensor_out, [0], [out_shape[0] // 2], [out_shape[0]] + ) + assert np.array_equal(tensor_x, out_1) + assert np.array_equal(tensor_y, out_2) + print("test allgather api ok\n") + + if pg.rank() == 0: + task = pg.all_gather(tensor_x, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + tensor_out_list = [] + task = dist.all_gather(tensor_out_list, tensor_y, sync_op=False) + paddle.device.cuda.synchronize() + tensor_out = paddle.concat(tensor_out_list) + out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) + out_2 = paddle.slice( + tensor_out, [0], [out_shape[0] // 2], [out_shape[0]] + ) + assert np.array_equal(tensor_x, out_1) + assert np.array_equal(tensor_y, out_2) + print("test allgather api2 ok\n") + + # test alltoall + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + out1 = np.random.random(self.shape).astype(self.dtype) + out2 = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + tensor_out1 = paddle.to_tensor(out1) + tensor_out2 = paddle.to_tensor(out2) + raw_tensor_x_2 = paddle.slice( + tensor_x, [0], [self.shape[0] // 2], [self.shape[0]] + ) + raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0], [self.shape[0] // 2]) + if pg.rank() == 0: + task = pg.alltoall(tensor_x, tensor_out1) + task.wait() + # rank 1 + else: + in_1, in_2 = paddle.split(tensor_y, 2) + out_1, out_2 = paddle.split(tensor_out2, 2) + out_tensor_list = [out_1, out_2] + task = dist.alltoall([in_1, in_2], out_tensor_list) + paddle.device.cuda.synchronize() + tensor_out2 = paddle.concat(out_tensor_list) + out1_2 = paddle.slice( + tensor_out1, [0], [self.shape[0] // 2], [self.shape[0]] + ) + out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2]) + if pg.rank() == 0: + assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy()) + else: + assert np.array_equal(out2_1, raw_tensor_x_2) + print("test alltoall api ok\n") + + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + out1 = np.random.random(self.shape).astype(self.dtype) + out2 = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + tensor_out1 = paddle.to_tensor(out1) + tensor_out2 = paddle.to_tensor(out2) + raw_tensor_x_2 = paddle.slice( + tensor_x, [0], [self.shape[0] // 2], [self.shape[0]] + ) + raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0], [self.shape[0] // 2]) + if pg.rank() == 0: + task = pg.alltoall(tensor_x, tensor_out1) + task.wait() + # rank 1 + else: + in_1, in_2 = paddle.split(tensor_y, 2) + out_1, out_2 = paddle.split(tensor_out2, 2) + out_tensor_list = [] + task = dist.alltoall([in_1, in_2], out_tensor_list) + paddle.device.cuda.synchronize() + tensor_out2 = paddle.concat(out_tensor_list) + out1_2 = paddle.slice( + tensor_out1, [0], [self.shape[0] // 2], [self.shape[0]] + ) + out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2]) + if pg.rank() == 0: + assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy()) + else: + assert np.array_equal(out2_1, raw_tensor_x_2) + print("test alltoall api2 ok\n") + + # test Reduce + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, sync_op=True) + paddle.device.cuda.synchronize() + # rank 1 + else: + task = dist.reduce(tensor_y, 0, sync_op=False) + task.wait() + paddle.device.cuda.synchronize() + if pg.rank() == 0: + assert np.array_equal(tensor_x, sum_result) + print("test reduce sum api ok\n") + + # test reduce max + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + max_result = paddle.maximum(tensor_x, tensor_y) + + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, dist.ReduceOp.MAX, sync_op=False) + task.wait() + assert np.array_equal(tensor_x, max_result) + else: + task = dist.reduce(tensor_y, 0, dist.ReduceOp.MAX, sync_op=False) + task.wait() + + print("test reduce max api ok") + + # test reduce min + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + min_result = paddle.minimum(tensor_x, tensor_y) + + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, dist.ReduceOp.MIN, sync_op=False) + task.wait() + assert np.array_equal(tensor_x, min_result) + else: + task = dist.reduce(tensor_y, 0, dist.ReduceOp.MIN, sync_op=False) + task.wait() + + print("test reduce min api ok") + + # test reduce product + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + prod_result = np.multiply(x, y) + + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, dist.ReduceOp.PROD, sync_op=False) + task.wait() + assert np.array_equal(tensor_x, prod_result) + else: + task = dist.reduce(tensor_y, 0, dist.ReduceOp.PROD, sync_op=False) + task.wait() + + print("test reduce prod api ok") + # test Scatter + # rank 0 + in_shape = list(self.shape) + in_shape[0] *= 2 + x = np.random.random(in_shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + if pg.rank() == 0: + in_1, in_2 = paddle.split(tensor_x, 2) + task = dist.scatter(tensor_y, [in_1, in_2], 0, sync_op=True) + # task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = dist.scatter(tensor_y, [], 0, sync_op=False) + task.wait() + paddle.device.cuda.synchronize() + out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]]) + out2 = paddle.slice(tensor_x, [0], [self.shape[0]], [self.shape[0] * 2]) + if pg.rank() == 0: + assert np.array_equal(tensor_y, out1) + else: + assert np.array_equal(tensor_y, out2) + print("test scatter api ok\n") + + # test Scatter with shape=[] + # rank 0 + x = np.random.random([]).astype(self.dtype) + y = np.random.random([]).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + if pg.rank() == 0: + in_1, in_2 = tensor_x, tensor_x + 1 + task = dist.scatter(tensor_y, [in_1, in_2], 0, sync_op=True) + paddle.device.cuda.synchronize() + # rank 1 + else: + task = dist.scatter(tensor_y, [], 0, sync_op=True) + task.wait() + paddle.device.cuda.synchronize() + out1 = paddle.assign(tensor_x) + out2 = paddle.assign(tensor_x + 1) + if pg.rank() == 0: + assert np.array_equal(tensor_y, out1) + else: + assert np.array_equal(tensor_y, out2), f"{tensor_y}, {out2}" + assert tensor_y.shape == [] + print("test scatter api with shape=[] ok\n") + + # test send min + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + if pg.rank() == 0: + task = dist.send(tensor_x, 1, sync_op=False) + task.wait() + else: + task = dist.recv(tensor_y, 0, sync_op=False) + task.wait() + assert np.array_equal(tensor_y, tensor_x) + + print("test send api ok") + + # test send min + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + if pg.rank() == 0: + task = dist.send(tensor_x, 1, sync_op=True) + else: + task = dist.recv(tensor_y, 0, sync_op=True) + assert np.array_equal(tensor_y, tensor_x) + + print("test send api ok") class TestProcessGroupFp16(TestProcessGroupFp32):