# process_group_mpi.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import random
import numpy as np
import os
import shutil

import paddle
from paddle.fluid import core
from datetime import timedelta
import paddle.fluid.core as core
from paddle.fluid.framework import _test_eager_guard
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.distributed.collective import Group
from paddle.distributed.collective import _group_map_by_name
from paddle.distributed.collective import _default_group_name
from paddle.distributed.collective import _set_group_map
from paddle.distributed.collective import _set_group_map_by_name
from paddle.distributed.collective import _set_group_map_backend
from paddle.fluid.framework import _set_expected_place
import paddle.distributed as dist
import ctypes

# Load the MPI shared library at import time. RTLD_GLOBAL makes the symbols of
# libmpi.so available for symbol resolution of libraries loaded afterwards.
# NOTE(review): presumably needed so Paddle's MPI process-group backend can
# resolve MPI symbols at runtime -- confirm.
ctypes.CDLL("libmpi.so", mode=ctypes.RTLD_GLOBAL)


def init_process_group(strategy=None):
    """Build and register the default communication group on top of MPI.

    Creates a ``core.ProcessGroupMPI`` for group id 0, sets the expected
    place to ``core.CPUPlace()``, wraps the process group in a ``Group``
    covering ranks ``0 .. world_size - 1`` and records that group in the
    group maps (by name, by id, and with backend ``"mpi"``).

    Args:
        strategy: Not used by this function.

    Returns:
        The ``Group`` object that was created and registered.
    """
    default_gid = 0
    mpi_pg = core.ProcessGroupMPI.create([], default_gid)
    my_rank = mpi_pg.get_rank()
    nranks = mpi_pg.get_world_size()

    # Only the CPU place is configured here.
    _set_expected_place(core.CPUPlace())

    default_group = Group(
        my_rank,
        nranks,
        id=default_gid,
        ranks=list(range(nranks)),
        pg=mpi_pg,
        name=_default_group_name,
    )
    _set_group_map_by_name(_default_group_name, default_group)
    _set_group_map(default_gid, default_group)
    _set_group_map_backend(default_group, "mpi")
    return default_group


def test_allreduce_sum(pg, shape, dtype):
    """Check ``dist.all_reduce`` (no explicit op) against ``x + y``.

    Two random tensors are created on every process. Rank 0 passes the first
    one to ``dist.all_reduce`` and every other rank passes the second one;
    afterwards the tensor that was passed in must equal the element-wise sum
    of both.

    Args:
        pg: Process group object; only ``pg.rank()`` is used.
        shape: Shape of the random inputs.
        dtype: Numpy dtype name the inputs are cast to.
    """
    rank0_input = paddle.to_tensor(np.random.random(shape).astype(dtype))
    other_input = paddle.to_tensor(np.random.random(shape).astype(dtype))
    expected = rank0_input + other_input

    local = rank0_input if pg.rank() == 0 else other_input
    dist.all_reduce(local)
    assert np.array_equal(local, expected)
    print("test allreduce sum api ok")


def test_allreduce_max(pg, shape, dtype):
    """Check ``dist.all_reduce`` with ``ReduceOp.MAX``.

    Rank 0 contributes the first random tensor, every other rank the second.
    The call is made with ``use_calc_stream=False`` and the returned task is
    waited on before the local tensor is compared with
    ``paddle.maximum`` of both inputs.

    Args:
        pg: Process group object; only ``pg.rank()`` is used.
        shape: Shape of the random inputs.
        dtype: Numpy dtype name the inputs are cast to.
    """
    rank0_input = paddle.to_tensor(np.random.random(shape).astype(dtype))
    other_input = paddle.to_tensor(np.random.random(shape).astype(dtype))
    expected = paddle.maximum(rank0_input, other_input)

    local = rank0_input if pg.rank() == 0 else other_input
    handle = dist.all_reduce(local, dist.ReduceOp.MAX, use_calc_stream=False)
    handle.wait()
    assert np.array_equal(local, expected)
    print("test allreduce max api ok")


def test_allreduce_min(pg, shape, dtype):
    """Check ``dist.all_reduce`` with ``ReduceOp.MIN``.

    Rank 0 contributes the first random tensor, every other rank the second.
    The call is made with ``use_calc_stream=False`` and the returned task is
    waited on before the local tensor is compared with
    ``paddle.minimum`` of both inputs.

    Args:
        pg: Process group object; only ``pg.rank()`` is used.
        shape: Shape of the random inputs.
        dtype: Numpy dtype name the inputs are cast to.
    """
    rank0_input = paddle.to_tensor(np.random.random(shape).astype(dtype))
    other_input = paddle.to_tensor(np.random.random(shape).astype(dtype))
    expected = paddle.minimum(rank0_input, other_input)

    local = rank0_input if pg.rank() == 0 else other_input
    handle = dist.all_reduce(local, dist.ReduceOp.MIN, use_calc_stream=False)
    handle.wait()
    assert np.array_equal(local, expected)
    print("test allreduce min api ok")


def test_allreduce_prod(pg, shape, dtype):
    """Check ``dist.all_reduce`` with ``ReduceOp.PROD``.

    Rank 0 contributes the first random tensor, every other rank the second.
    The call is made with ``use_calc_stream=False`` and the returned task is
    waited on before the local tensor is compared with ``np.multiply`` of the
    two underlying numpy arrays.

    Args:
        pg: Process group object; only ``pg.rank()`` is used.
        shape: Shape of the random inputs.
        dtype: Numpy dtype name the inputs are cast to.
    """
    rank0_array = np.random.random(shape).astype(dtype)
    other_array = np.random.random(shape).astype(dtype)
    rank0_input = paddle.to_tensor(rank0_array)
    other_input = paddle.to_tensor(other_array)
    expected = np.multiply(rank0_array, other_array)

    local = rank0_input if pg.rank() == 0 else other_input
    handle = dist.all_reduce(local, dist.ReduceOp.PROD, use_calc_stream=False)
    handle.wait()
    assert np.array_equal(local, expected)
    print("test allreduce prod api ok")


def test_broadcast(pg, shape, dtype):
    """Check ``dist.broadcast`` with source rank 0.

    Rank 0 broadcasts the first random tensor with ``use_calc_stream=False``,
    calls ``synchronize()`` on the returned task and asserts
    ``is_completed()``. Every other rank calls ``dist.broadcast`` on the
    second tensor with default arguments. On all ranks the tensor passed to
    the call must equal a copy of the first tensor taken beforehand.

    Args:
        pg: Process group object; only ``pg.rank()`` is used.
        shape: Shape of the random inputs.
        dtype: Numpy dtype name the inputs are cast to.
    """
    source = paddle.to_tensor(np.random.random(shape).astype(dtype))
    receiver = paddle.to_tensor(np.random.random(shape).astype(dtype))
    expected = paddle.assign(source)

    if pg.rank() != 0:
        dist.broadcast(receiver, 0)
        assert np.array_equal(expected, receiver)
    else:
        handle = dist.broadcast(source, 0, use_calc_stream=False)
        handle.synchronize()
        assert handle.is_completed()
        assert np.array_equal(expected, source)
    print("test broadcast api ok")


def test_barrair(pg):
    """Run a barrier through two different entry points.

    Rank 0 calls ``dist.barrier()``; every other rank calls ``pg.barrier()``
    and waits on the task it returns.

    Args:
        pg: Process group object providing ``rank()`` and ``barrier()``.
    """
    if pg.rank() != 0:
        handle = pg.barrier()
        handle.wait()
    else:
        dist.barrier()
    print("test barrier api ok\n")


def test_allgather(pg, shape, dtype):
    """Check all-gather through ``pg.all_gather`` and ``dist.all_gather``.

    Two rounds are run. In both, rank 0 calls ``pg.all_gather`` into an
    output tensor whose first dimension is twice ``shape[0]`` and waits on
    the task, while every other rank calls ``dist.all_gather`` with
    ``use_calc_stream=False`` and concatenates the resulting list. The first
    round passes a list of two ``paddle.empty_like`` tensors; the second
    passes an empty list. After each round the first half of the gathered
    tensor must equal the first input and the second half the second input.

    Args:
        pg: Process group object providing ``rank()`` and ``all_gather()``.
        shape: Shape of each per-rank input.
        dtype: Numpy dtype name the inputs are cast to.
    """
    x = np.random.random(shape).astype(dtype)
    y = np.random.random(shape).astype(dtype)
    tensor_x = paddle.to_tensor(x)
    tensor_y = paddle.to_tensor(y)

    full_len = shape[0] * 2
    half_len = full_len // 2
    gathered_shape = [full_len] + list(shape)[1:]
    tensor_out = paddle.to_tensor(
        np.random.random(gathered_shape).astype(dtype))

    def check_halves(gathered):
        # Split the gathered tensor along axis 0 and compare both halves.
        lower = paddle.slice(gathered, [0], [0], [half_len])
        upper = paddle.slice(gathered, [0], [half_len], [full_len])
        assert np.array_equal(tensor_x, lower)
        assert np.array_equal(tensor_y, upper)

    # Round 1: the list passed to dist.all_gather already holds two tensors.
    if pg.rank() == 0:
        handle = pg.all_gather(tensor_x, tensor_out)
        handle.wait()
    else:
        slots = [paddle.empty_like(tensor_x), paddle.empty_like(tensor_x)]
        dist.all_gather(slots, tensor_y, use_calc_stream=False)
        tensor_out = paddle.concat(slots)
    check_halves(tensor_out)
    print("test allgather api ok\n")

    # Round 2: the list passed to dist.all_gather starts out empty.
    if pg.rank() == 0:
        handle = pg.all_gather(tensor_x, tensor_out)
        handle.wait()
    else:
        slots = []
        dist.all_gather(slots, tensor_y, use_calc_stream=False)
        tensor_out = paddle.concat(slots)
    check_halves(tensor_out)
    print("test allgather api2 ok\n")


def test_all2all(pg, shape, dtype):
    """Check all-to-all through ``pg.alltoall`` and ``dist.alltoall``.

    Two rounds are run, each on freshly drawn random data. Rank 0 calls
    ``pg.alltoall`` on whole tensors and waits on the task; every other rank
    splits its input in two and calls ``dist.alltoall`` with tensor lists,
    then concatenates the output list. In the first round the output list is
    pre-filled with the two halves of the output tensor; in the second round
    it starts out empty. Rank 0 then expects the upper half of its output to
    equal the lower half of the second input, and other ranks expect the
    lower half of their output to equal the upper half of the first input.

    Args:
        pg: Process group object providing ``rank()`` and ``alltoall()``.
        shape: Shape of each input/output tensor.
        dtype: Numpy dtype name the data is cast to.
    """
    full = shape[0]
    half = shape[0] // 2

    def run_round(prefill_outputs, done_message):
        # Draw order (x, y, out1, out2) matters for seeded reproducibility.
        x = np.random.random(shape).astype(dtype)
        y = np.random.random(shape).astype(dtype)
        out1 = np.random.random(shape).astype(dtype)
        out2 = np.random.random(shape).astype(dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        tensor_out1 = paddle.to_tensor(out1)
        tensor_out2 = paddle.to_tensor(out2)
        x_upper = paddle.slice(tensor_x, [0], [half], [full])
        y_lower = paddle.slice(tensor_y, [0], [0], [half])

        if pg.rank() == 0:
            handle = pg.alltoall(tensor_x, tensor_out1)
            handle.wait()
        else:
            in_a, in_b = paddle.split(tensor_y, 2)
            out_a, out_b = paddle.split(tensor_out2, 2)
            recv_list = [out_a, out_b] if prefill_outputs else []
            dist.alltoall([in_a, in_b], recv_list)
            tensor_out2 = paddle.concat(recv_list)

        out1_upper = paddle.slice(tensor_out1, [0], [half], [full])
        out2_lower = paddle.slice(tensor_out2, [0], [0], [half])
        if pg.rank() == 0:
            assert np.array_equal(out1_upper.numpy(), y_lower.numpy())
        else:
            assert np.array_equal(out2_lower, x_upper)
        print(done_message)

    run_round(True, "test alltoall api ok\n")
    run_round(False, "test alltoall api2 ok\n")


def test_reduce_sum(pg, shape, dtype):
    """Check ``dist.reduce`` (no explicit op) to destination rank 0.

    Rank 0 calls ``dist.reduce`` on the first tensor with
    ``use_calc_stream=True`` and then expects it to equal ``x + y``. Every
    other rank calls it on the second tensor with ``use_calc_stream=False``
    and waits on the returned task; nothing is asserted there.

    Args:
        pg: Process group object; only ``pg.rank()`` is used.
        shape: Shape of the random inputs.
        dtype: Numpy dtype name the inputs are cast to.
    """
    x = np.random.random(shape).astype(dtype)
    y = np.random.random(shape).astype(dtype)
    root_input = paddle.to_tensor(x)
    other_input = paddle.to_tensor(y)
    expected = root_input + other_input

    if pg.rank() == 0:
        dist.reduce(root_input, 0, use_calc_stream=True)
        assert np.array_equal(root_input, expected)
    else:
        handle = dist.reduce(other_input, 0, use_calc_stream=False)
        handle.wait()
    print("test reduce sum api ok\n")


def test_reduce_max(pg, shape, dtype):
    """Check ``dist.reduce`` with ``ReduceOp.MAX`` to destination rank 0.

    Every rank calls ``dist.reduce`` with ``use_calc_stream=False`` and waits
    on the returned task (rank 0 on the first tensor, other ranks on the
    second). Only rank 0 asserts that its tensor equals ``paddle.maximum``
    of both inputs.

    Args:
        pg: Process group object; only ``pg.rank()`` is used.
        shape: Shape of the random inputs.
        dtype: Numpy dtype name the inputs are cast to.
    """
    root_input = paddle.to_tensor(np.random.random(shape).astype(dtype))
    other_input = paddle.to_tensor(np.random.random(shape).astype(dtype))
    expected = paddle.maximum(root_input, other_input)

    is_root = pg.rank() == 0
    local = root_input if is_root else other_input
    handle = dist.reduce(local, 0, dist.ReduceOp.MAX, use_calc_stream=False)
    handle.wait()
    if is_root:
        assert np.array_equal(local, expected)
    print("test reduce max api ok")


def test_reduce_min(pg, shape, dtype):
    """Check ``dist.reduce`` with ``ReduceOp.MIN`` to destination rank 0.

    Every rank calls ``dist.reduce`` with ``use_calc_stream=False`` and waits
    on the returned task (rank 0 on the first tensor, other ranks on the
    second). Only rank 0 asserts that its tensor equals ``paddle.minimum``
    of both inputs.

    Args:
        pg: Process group object; only ``pg.rank()`` is used.
        shape: Shape of the random inputs.
        dtype: Numpy dtype name the inputs are cast to.
    """
    root_input = paddle.to_tensor(np.random.random(shape).astype(dtype))
    other_input = paddle.to_tensor(np.random.random(shape).astype(dtype))
    expected = paddle.minimum(root_input, other_input)

    is_root = pg.rank() == 0
    local = root_input if is_root else other_input
    handle = dist.reduce(local, 0, dist.ReduceOp.MIN, use_calc_stream=False)
    handle.wait()
    if is_root:
        assert np.array_equal(local, expected)
    print("test reduce min api ok")


def test_reduce_prod(pg, shape, dtype):
    """Check ``dist.reduce`` with ``ReduceOp.PROD`` to destination rank 0.

    Every rank calls ``dist.reduce`` with ``use_calc_stream=False`` and waits
    on the returned task (rank 0 on the first tensor, other ranks on the
    second). Only rank 0 asserts that its tensor equals ``np.multiply`` of
    the two underlying numpy arrays.

    Args:
        pg: Process group object; only ``pg.rank()`` is used.
        shape: Shape of the random inputs.
        dtype: Numpy dtype name the inputs are cast to.
    """
    root_array = np.random.random(shape).astype(dtype)
    other_array = np.random.random(shape).astype(dtype)
    root_input = paddle.to_tensor(root_array)
    other_input = paddle.to_tensor(other_array)
    expected = np.multiply(root_array, other_array)

    is_root = pg.rank() == 0
    local = root_input if is_root else other_input
    handle = dist.reduce(local, 0, dist.ReduceOp.PROD, use_calc_stream=False)
    handle.wait()
    if is_root:
        assert np.array_equal(local, expected)
    print("test reduce prod api ok")


def test_scatter(pg, shape, dtype):
    """Check ``dist.scatter`` from source rank 0.

    A source tensor with a doubled first dimension is split in two on rank 0
    and scattered with ``use_calc_stream=True``. Every other rank passes an
    empty list with ``use_calc_stream=False`` and waits on the returned task.
    Rank 0 then expects its receive tensor to equal the first ``shape[0]``
    rows of the source, other ranks the following ``shape[0]`` rows.

    Args:
        pg: Process group object; only ``pg.rank()`` is used.
        shape: Shape of the per-rank receive tensor.
        dtype: Numpy dtype name the data is cast to.
    """
    rows = shape[0]
    source_shape = [rows * 2] + list(shape)[1:]
    source = paddle.to_tensor(np.random.random(source_shape).astype(dtype))
    received = paddle.to_tensor(np.random.random(shape).astype(dtype))

    is_root = pg.rank() == 0
    if is_root:
        part_a, part_b = paddle.split(source, 2)
        dist.scatter(received, [part_a, part_b], 0, use_calc_stream=True)
    else:
        handle = dist.scatter(received, [], 0, use_calc_stream=False)
        handle.wait()

    first_rows = paddle.slice(source, [0], [0], [rows])
    second_rows = paddle.slice(source, [0], [rows], [rows * 2])
    expected = first_rows if is_root else second_rows
    assert np.array_equal(received, expected)
    print("test scatter api ok\n")


def test_send_recv(pg, sub_group, shape, dtype):
    """Check ``dist.send`` / ``dist.recv`` between rank 0 and rank 1.

    Two rounds are run on freshly drawn random data. In the first round both
    calls use ``use_calc_stream=False`` and the returned task is waited on;
    in the second round both use ``use_calc_stream=True`` and no task is
    waited on. In each round rank 0 sends the first tensor to rank 1, and
    rank 1 receives into the second tensor and asserts it equals the first.
    Ranks other than 0 and 1 only draw the random data.

    Args:
        pg: Process group object; only ``pg.rank()`` is used.
        sub_group: Group passed as ``group=`` to ``dist.send``/``dist.recv``.
        shape: Shape of the random tensors.
        dtype: Numpy dtype name the data is cast to.
    """

    def exchange(on_calc_stream):
        payload = paddle.to_tensor(np.random.random(shape).astype(dtype))
        inbox = paddle.to_tensor(np.random.random(shape).astype(dtype))

        my_rank = pg.rank()
        if my_rank == 0:
            handle = dist.send(payload,
                               1,
                               group=sub_group,
                               use_calc_stream=on_calc_stream)
            if not on_calc_stream:
                handle.wait()
        elif my_rank == 1:
            handle = dist.recv(inbox,
                               0,
                               group=sub_group,
                               use_calc_stream=on_calc_stream)
            if not on_calc_stream:
                handle.wait()
            assert np.array_equal(inbox, payload)

        print("test send api ok")

    # Round 1: use_calc_stream=False, wait on the returned task.
    exchange(False)
    # Round 2: use_calc_stream=True, nothing to wait on.
    exchange(True)


class TestProcessGroup(unittest.TestCase):
    """Runs every MPI process-group check in this file from a single test."""

    def setUp(self):
        # Seed paddle, the stdlib RNG and numpy with the same fixed value,
        # then load the tensor configuration.
        fixed_seed = 2022
        for seed_fn in (paddle.seed, random.seed, np.random.seed):
            seed_fn(fixed_seed)
        self.config()

    def config(self):
        # dtype and shape shared by all checks.
        self.dtype, self.shape = "float32", (2, 10, 5)

    def test_create_process_group_mpi(self):
        # Comments are used instead of a docstring so the description that
        # unittest reports for this test stays the method name.
        with _test_eager_guard():
            group = init_process_group()
            pg = group.process_group
            tensor_args = (self.shape, self.dtype)

            # all_reduce (sum, max, min, prod) followed by broadcast
            for check in (
                    test_allreduce_sum,
                    test_allreduce_max,
                    test_allreduce_min,
                    test_allreduce_prod,
                    test_broadcast,
            ):
                check(pg, *tensor_args)

            # barrier takes no tensor arguments
            test_barrair(pg)

            # all_gather, alltoall, reduce (sum, max, min, prod), scatter
            for check in (
                    test_allgather,
                    test_all2all,
                    test_reduce_sum,
                    test_reduce_max,
                    test_reduce_min,
                    test_reduce_prod,
                    test_scatter,
            ):
                check(pg, *tensor_args)

            # send / recv additionally needs the Group object
            test_send_recv(pg, group, *tensor_args)


if __name__ == "__main__":
    unittest.main()