Mpi final dev simple (#46247)

9ce31e96 · wuhuachaocoding · GitHub · 3d656b58 · 9ce31e96 · 9ce31e96
16 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -485,6 +485,9 @@ if(WITH_DISTRIBUTE)
        ON
        CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
  endif()
+  set(WITH_MPI
+      ON
+      CACHE STRING "Enable MPI when compiling WITH_DISTRIBUTE=ON." FORCE)
  if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC)
    # disable WITH_PSCORE for NPU before include third_party
    message(
@@ -509,6 +512,10 @@ if(WITH_DISTRIBUTE)
  endif()
 endif()

+if(WITH_MPI)
+  include(mpi)
+endif()
+
 include(third_party
 )# download, build, install third_party, Contains about 20+ dependencies


--- a/cmake/mpi.cmake
+++ b/cmake/mpi.cmake
+# Locate an MPI implementation and wire it into the build.
+#
+# No-op unless both WITH_DISTRIBUTE and WITH_MPI are enabled. If no MPI C++
+# installation is found, WITH_MPI is forced OFF in the cache and this file
+# returns early. Otherwise the MPI include path, the MPI link flags and the
+# PADDLE_WITH_MPI definition are added at directory scope, and
+# PADDLE_WITH_MPI_AWARE is defined when ompi_info lists the "smcuda" component.
+if(NOT WITH_DISTRIBUTE OR NOT WITH_MPI)
+  return()
+endif()
+
+find_package(MPI)
+
+if(NOT MPI_CXX_FOUND)
+  set(WITH_MPI
+      OFF
+      CACHE STRING "Disable MPI" FORCE)
+  message(WARNING "Not found MPI support in current system")
+  return()
+endif()
+
+# Expand inside the quotes so list values keep their separators instead of
+# being glued together into a single token.
+message(STATUS "MPI compile flags: ${MPI_CXX_COMPILE_FLAGS}")
+message(STATUS "MPI include path: ${MPI_CXX_INCLUDE_PATH}")
+message(STATUS "MPI LINK flags path: ${MPI_CXX_LINK_FLAGS}")
+message(STATUS "MPI libraries: ${MPI_CXX_LIBRARIES}")
+include_directories(SYSTEM ${MPI_CXX_INCLUDE_PATH})
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_CXX_LINK_FLAGS}")
+add_definitions("-DPADDLE_WITH_MPI")
+
+# MPI_CXX_LIBRARIES is a list of library files, so appending "/../bin" to it
+# never names a real directory. Build the hints from the directory of each
+# library and from the directory of the MPI launcher instead.
+set(ompi_info_hints_ "")
+foreach(mpi_lib_ IN LISTS MPI_CXX_LIBRARIES)
+  if(IS_ABSOLUTE "${mpi_lib_}")
+    get_filename_component(mpi_lib_dir_ "${mpi_lib_}" DIRECTORY)
+    list(APPEND ompi_info_hints_ "${mpi_lib_dir_}/../bin")
+  endif()
+endforeach()
+if(MPIEXEC_EXECUTABLE)
+  get_filename_component(mpiexec_dir_ "${MPIEXEC_EXECUTABLE}" DIRECTORY)
+  list(APPEND ompi_info_hints_ "${mpiexec_dir_}")
+endif()
+find_program(
+  OMPI_INFO
+  NAMES ompi_info
+  HINTS ${ompi_info_hints_})
+
+if(OMPI_INFO)
+  # Only trust the output when ompi_info itself ran successfully.
+  execute_process(
+    COMMAND "${OMPI_INFO}"
+    OUTPUT_VARIABLE output_
+    RESULT_VARIABLE ompi_info_result_
+    ERROR_QUIET)
+  if(ompi_info_result_ EQUAL 0 AND output_ MATCHES "smcuda")
+    # NOTE: some MPI libraries are CUDA-aware.
+    add_definitions("-DPADDLE_WITH_MPI_AWARE")
+  endif()
+endif()
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -43,6 +43,13 @@ if(WITH_NCCL OR WITH_RCCL)
  endif()
 endif()

+if(WITH_MPI)
+  cc_library(
+    processgroup_mpi
+    SRCS ProcessGroupMPI.cc MPITools.cc Common.cc
+    DEPS collective_helper device_context)
+endif()
+
 if(WITH_ASCEND_CL)
  cc_library(
    processgroup_hccl

--- a/paddle/fluid/distributed/collective/MPITools.cc
+++ b/paddle/fluid/distributed/collective/MPITools.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/collective/MPITools.h"
+#include "paddle/fluid/distributed/collective/Common.h"
+#include "paddle/fluid/distributed/collective/Types.h"
+
+namespace paddle {
+namespace distributed {
+namespace mpi {
+
+// Translates a Paddle ReduceOp into the matching MPI reduction operator.
+//
+// Only MIN, MAX, SUM and PRODUCT have a mapping; any other value fails the
+// enforce below with an InvalidArgument error.
+MPI_Op ToMPIType(ReduceOp reduction) {
+  // Lookup table, constructed once on the first call.
+  static const std::map<ReduceOp, MPI_Op> red_type = {
+      {ReduceOp::MIN, MPI_MIN},
+      {ReduceOp::MAX, MPI_MAX},
+      {ReduceOp::SUM, MPI_SUM},
+      {ReduceOp::PRODUCT, MPI_PROD},
+  };
+  auto it = red_type.find(reduction);
+  PADDLE_ENFORCE_EQ(it != red_type.end(),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "Invalid mpi reduction. Must be MPI_MIN | MPI_MAX | "
+                        "MPI_PROD | MPI_SUM."));
+  return it->second;
+}
+
+// NOTE: CUDA-aware MPI is not supported yet, so this always reports false
+// (it does not consult PADDLE_WITH_MPI_AWARE).
+bool CheckMpiCudaAware() { return false; }
+
+// Validates the tensor list handed to an MPI operation.
+//
+// Fails with InvalidArgument unless `tensors` holds exactly one tensor, and
+// also when CheckTensorsInCudaPlace(tensors) is true while CheckMpiCudaAware()
+// is false.
+void CheckValidInputs(const std::vector<phi::DenseTensor>& tensors) {
+  PADDLE_ENFORCE_EQ(
+      tensors.size() == 1,
+      true,
+      platform::errors::InvalidArgument("the inputs size of MPI must be 1!"));
+
+  // The combined condition must be false: CUDA tensors are only acceptable
+  // when the MPI library is reported as CUDA-aware.
+  PADDLE_ENFORCE_EQ(CheckTensorsInCudaPlace(tensors) && !CheckMpiCudaAware(),
+                    false,
+                    platform::errors::InvalidArgument(
+                        "Found CUDA Tensor. But CUDA-aware MPI not support!"));
+}
+
+}  //  namespace mpi
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/distributed/collective/MPITools.h
+++ b/paddle/fluid/distributed/collective/MPITools.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <error.h>
+#include <iostream>
+#include <string>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#include "paddle/fluid/distributed/collective/Types.h"
+
+#ifdef HOST
+#undef HOST
+#endif
+
+#include <mpi.h>
+
+namespace paddle {
+namespace distributed {
+namespace mpi {
+
+// Evaluates an MPI call (or any expression yielding an MPI return code) once
+// and terminates the process when the result is not MPI_SUCCESS, logging the
+// source location and the numeric error code.
+//
+// `cmd` is parenthesized so compound expressions bind as a whole, and the
+// local uses a name unlikely to collide with identifiers inside `cmd`.
+#define MPI_CHECK(cmd)                                                      \
+  do {                                                                      \
+    const int mpi_check_ret_ = (cmd);                                       \
+    if (mpi_check_ret_ != MPI_SUCCESS) {                                    \
+      LOG(FATAL) << "Failed, MPI error in " << __FILE__ << ":" << __LINE__  \
+                 << " with error code: " << mpi_check_ret_ << std::endl;    \
+      exit(EXIT_FAILURE);                                                   \
+    }                                                                       \
+  } while (0)
+
+// Maps a Paddle ReduceOp to the corresponding MPI_Op.
+MPI_Op ToMPIType(ReduceOp reduction);
+
+// Reports whether CUDA tensors may be passed to the MPI backend.
+bool CheckMpiCudaAware();
+
+// Validates the tensor list passed to an MPI operation; raises on violation.
+void CheckValidInputs(const std::vector<phi::DenseTensor>& tensors);
+
+}  // namespace mpi
+}  // namespace distributed
+}  // namespace paddle
--- a/paddle/fluid/distributed/collective/ProcessGroup.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroup.cc
@@ -52,5 +52,13 @@ ProcessGroup::ProcessGroup(int rank,
  }
 }

+ProcessGroup::ProcessGroup(int rank, int size, int gid)
+    : rank_(rank), size_(size), gid_(gid) {
+  // Groups created with IGNORE_ID are not registered in the gid lookup map.
+  if (gid != IGNORE_ID) {
+    ProcessGroupMapFromGid::getInstance()->insert(gid_, this);
+  }
+}
+
 }  //  namespace distributed
 }  //  namespace paddle
--- a/paddle/fluid/distributed/collective/ProcessGroup.h
+++ b/paddle/fluid/distributed/collective/ProcessGroup.h
@@ -82,6 +82,9 @@ class ProcessGroup {
                        int size,
                        const platform::Place& place,
                        int gid);
+
+  explicit ProcessGroup(int rank, int size, int gid);
+
  virtual ~ProcessGroup() {}

  int GetRank() const { return rank_; }

--- a/paddle/fluid/distributed/collective/ProcessGroupMPI.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupMPI.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/collective/ProcessGroupMPI.h"
+#include <chrono>
+#include "paddle/fluid/distributed/collective/Common.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+
+constexpr int64_t kWaitBlockTImeout = 10;
+namespace paddle {
+namespace distributed {
+
+// Maps Paddle element types to MPI datatypes of exactly the same width.
+//
+// Fixed-width MPI types are used for the integers on purpose:
+//  * MPI_CHAR denotes a text character and is not an allowed operand type for
+//    the arithmetic reductions (MPI_SUM, MPI_MIN, ...), so INT8 tensors could
+//    not be reduced with it; MPI_INT8_T can.
+//  * MPI_LONG follows the C `long`, which is not 64 bits wide on every ABI;
+//    MPI_INT64_T always is.
+// Lookups use .at(), so an unmapped dtype raises std::out_of_range.
+std::map<phi::DataType, MPI_Datatype> mpiDatatype = {
+    {phi::DataType::INT8, MPI_INT8_T},
+    {phi::DataType::UINT8, MPI_UINT8_T},
+    {phi::DataType::FLOAT32, MPI_FLOAT},
+    {phi::DataType::FLOAT64, MPI_DOUBLE},
+    {phi::DataType::INT32, MPI_INT32_T},
+    {phi::DataType::INT64, MPI_INT64_T}};
+
+// Marks the task as finished with a failure; `eptr` is stored by Finish() and
+// rethrown to whoever calls Wait() on the task.
+void ProcessGroupMPI::MPITask::FinishMPITaskError(std::exception_ptr eptr) {
+  Finish(eptr);
+}
+
+// Marks the task as finished successfully (no stored exception).
+void ProcessGroupMPI::MPITask::FinishMPITask() { Finish(); }
+
+// Wraps an already-posted non-blocking MPI request (see Send/Recv).
+// The base Task is created with rank -1 and CommType::UNKNOWN.
+ProcessGroupMPI::MPIAsyncTask::MPIAsyncTask(
+    MPI_Request request, const std::vector<phi::DenseTensor>& inputs)
+    : ProcessGroup::Task(-1, inputs, CommType::UNKNOWN), request_(request) {
+  // Zero the status so MPI_ERROR reads as 0 until MPI fills it in.
+  memset(&status_, 0, sizeof(status_));
+}
+
+// Destroying a task whose request is still pending is treated as fatal: the
+// process is terminated. Callers must complete the request (Wait() or a
+// successful IsCompleted()) before dropping the task.
+ProcessGroupMPI::MPIAsyncTask::~MPIAsyncTask() {
+  if (request_ != MPI_REQUEST_NULL) {
+    std::cerr << " Task has not completed, try to destruct async mpi task, "
+              << "exit the program." << std::endl;
+    std::terminate();
+  }
+}
+
+// Non-blocking completion probe.
+//
+// Returns true once the request has completed (request_ is then
+// MPI_REQUEST_NULL), false while it is still pending.
+bool ProcessGroupMPI::MPIAsyncTask::IsCompleted() {
+  if (request_ == MPI_REQUEST_NULL) {
+    return true;
+  }
+
+  // All MPI calls in this backend are serialized through pg_global_mutex.
+  std::unique_lock<std::mutex> lock(pg_global_mutex);
+  int flag = 0;
+  MPI_CHECK(MPI_Test(&request_, &flag, &status_));
+  // Completion is detected through the request handle, not through `flag`.
+  if (request_ != MPI_REQUEST_NULL) {
+    return false;
+  }
+
+  // NOTE(review): the MPI standard does not require single-completion calls
+  // to fill status.MPI_ERROR, so this branch may never fire — confirm.
+  if (status_.MPI_ERROR != MPI_SUCCESS) {
+    AppearException();
+  }
+
+  return true;
+}
+
+// Blocks until the wrapped MPI request has completed.
+//
+// Returns true on completion. If the status carries an MPI error, the error
+// text is converted to a std::runtime_error and rethrown.
+//
+// NOTE: `timeout` is accepted for interface compatibility but is not honored;
+// MPI_Wait blocks until the request finishes. pg_global_mutex is held for the
+// whole wait, so no other MPI call of this backend can run meanwhile.
+bool ProcessGroupMPI::MPIAsyncTask::Wait(std::chrono::milliseconds timeout) {
+  if (request_ == MPI_REQUEST_NULL) {
+    return true;
+  }
+
+  std::unique_lock<std::mutex> lock(pg_global_mutex);
+  MPI_CHECK(MPI_Wait(&request_, &status_));
+
+  if (status_.MPI_ERROR != MPI_SUCCESS) {
+    AppearException();
+    // Does not return; nothing after this statement would be reachable.
+    std::rethrow_exception(exception_);
+  }
+
+  return true;
+}
+
+// Converts status_.MPI_ERROR into a std::runtime_error carrying the text from
+// MPI_Error_string and stores it in exception_ (it is not thrown here).
+void ProcessGroupMPI::MPIAsyncTask::AppearException() {
+  std::array<char, MPI_MAX_ERROR_STRING> buf;
+  int len = buf.size();
+  // MPI_Error_string writes the message into buf and its length into len.
+  MPI_CHECK(MPI_Error_string(status_.MPI_ERROR, buf.data(), &len));
+  exception_ =
+      std::make_exception_ptr(std::runtime_error(std::string(buf.data(), len)));
+}
+
+// Stores a copy of the `outputs` vector in outputs_, replacing any previous
+// value.
+void ProcessGroupMPI::MPIAsyncTask::SetOutputs(
+    std::vector<phi::DenseTensor>& outputs) {
+  outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
+}
+
+int ProcessGroupMPI::mpi_thread_support = 0;
+std::mutex ProcessGroupMPI::pg_global_mutex;
+std::once_flag ProcessGroupMPI::onceFlag;
+
+// Finalizes MPI. Registered with std::atexit by InitOneTimeMPI(), so it runs
+// during normal process exit.
+void ProcessGroupMPI::ExitMPI() {
+  std::unique_lock<std::mutex> lock(pg_global_mutex);
+  MPI_CHECK(MPI_Finalize());
+}
+
+// Initializes MPI exactly once per process (guarded by onceFlag) and verifies
+// that the library provides at least MPI_THREAD_SERIALIZED.
+//
+// If MPI has already been initialized by someone else (an embedding launcher,
+// another library, or an earlier attempt whose thread-level check threw), it
+// is not initialized again — a second MPI_Init_thread is erroneous — and only
+// the provided thread level is queried. In that case MPI_Finalize is left to
+// whoever initialized MPI, so ExitMPI is not registered.
+void ProcessGroupMPI::InitOneTimeMPI() {
+  std::call_once(onceFlag, []() {
+    int already_initialized = 0;
+    MPI_CHECK(MPI_Initialized(&already_initialized));
+    if (already_initialized) {
+      MPI_CHECK(MPI_Query_thread(&mpi_thread_support));
+    } else {
+      MPI_CHECK(MPI_Init_thread(
+          nullptr, nullptr, MPI_THREAD_SERIALIZED, &mpi_thread_support));
+    }
+    PADDLE_ENFORCE_EQ(
+        mpi_thread_support < MPI_THREAD_SERIALIZED,
+        false,
+        platform::errors::InvalidArgument("MPI supports the number of threads "
+                                          "less than MPI_THREAD_SERIALIZED. "));
+
+    if (!already_initialized) {
+      // We own the MPI lifetime: finalize at normal process exit.
+      std::atexit(ProcessGroupMPI::ExitMPI);
+    }
+  });
+}
+
+// Creates a process group over `ranks`, or over all of MPI_COMM_WORLD when
+// `ranks` is empty, initializing MPI on first use.
+//
+// Must be called by every process of MPI_COMM_WORLD when `ranks` is non-empty,
+// because MPI_Barrier and MPI_Comm_create are collective over the world
+// communicator. Returns an empty shared_ptr when the resulting communicator is
+// MPI_COMM_NULL (i.e. the calling process is not part of the group).
+std::shared_ptr<ProcessGroupMPI> ProcessGroupMPI::CreateProcessGroupMPI(
+    const std::vector<int>& ranks, int gid) {
+  InitOneTimeMPI();
+
+  MPI_Comm groupComm = MPI_COMM_WORLD;
+  int rank = -1;
+  int size = -1;
+
+  {
+    // All MPI calls in this backend are serialized through pg_global_mutex.
+    std::lock_guard<std::mutex> lock(pg_global_mutex);
+
+    if (!ranks.empty()) {
+      MPI_Group worldGroup;
+      MPI_Group ranksGroup;
+      MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
+      MPI_CHECK(MPI_Group_incl(worldGroup,
+                               static_cast<int>(ranks.size()),
+                               ranks.data(),
+                               &ranksGroup));
+
+      // Try up to maxRetries times and stop at the first success. MPI return
+      // codes are not booleans: MPI_SUCCESS is 0, so the result has to be
+      // compared explicitly. Stopping on success also avoids creating (and
+      // leaking) additional communicators.
+      constexpr int maxRetries = 3;
+      int create_status = MPI_ERR_OTHER;
+      MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
+      for (int i = 0; i < maxRetries; i++) {
+        create_status =
+            MPI_Comm_create(MPI_COMM_WORLD, ranksGroup, &groupComm);
+        if (create_status == MPI_SUCCESS) {
+          break;
+        }
+      }
+      // Fatal if every attempt failed.
+      MPI_CHECK(create_status);
+      MPI_CHECK(MPI_Group_free(&worldGroup));
+      MPI_CHECK(MPI_Group_free(&ranksGroup));
+    }
+
+    if (groupComm != MPI_COMM_NULL) {
+      MPI_CHECK(MPI_Comm_rank(groupComm, &rank));
+      MPI_CHECK(MPI_Comm_size(groupComm, &size));
+
+      PADDLE_ENFORCE_EQ(
+          rank < 0 || size < 0,
+          false,
+          platform::errors::InvalidArgument("get world_size or rank failed!"));
+    }
+  }
+
+  if (groupComm == MPI_COMM_NULL) {
+    return std::shared_ptr<ProcessGroupMPI>();
+  }
+
+  VLOG(3) << "MPI Group Create Success! rank = " << rank << " size = " << size
+          << " group_id = " << gid;
+
+  return std::make_shared<ProcessGroupMPI>(rank, size, groupComm, gid);
+}
+
+// Builds a process group on top of an existing communicator and starts the
+// worker thread that executes queued collectives (see workLoop).
+// Fails with InvalidArgument when `pg_comm` is MPI_COMM_NULL.
+ProcessGroupMPI::ProcessGroupMPI(int rank, int size, MPI_Comm pg_comm, int gid)
+    : ProcessGroup(rank, size, gid), stop_(false), pg_comm(pg_comm) {
+  PADDLE_ENFORCE_EQ(
+      pg_comm == MPI_COMM_NULL,
+      false,
+      platform::errors::InvalidArgument("Error! mpi comm is MPI_COMM_NULL!"));
+
+  worker_thread = std::thread(&ProcessGroupMPI::workLoop, this);
+}
+
+// Drains the task queue, then stops and joins the worker thread.
+ProcessGroupMPI::~ProcessGroupMPI() {
+  std::unique_lock<std::mutex> lock(pg_mutex);
+  // Wait until the worker has dequeued every pending task.
+  queue_consume.wait(lock, [&] { return queue_.empty(); });
+  stop_ = true;
+  lock.unlock();
+  // Wake the worker in case it is blocked waiting for new work.
+  queue_produce.notify_all();
+
+  worker_thread.join();
+}
+
+// Body of the worker thread: pops queued tasks one at a time, runs them with
+// pg_mutex released, and marks the associated MPITask as finished (with the
+// caught exception, if the task threw). Exits when stop_ is set.
+void ProcessGroupMPI::workLoop() {
+  std::unique_lock<std::mutex> lock(pg_mutex);
+
+  while (!stop_) {
+    if (queue_.empty()) {
+      // Sleep until Enqueue() or the destructor signals; re-check afterwards.
+      queue_produce.wait(lock);
+      continue;
+    }
+
+    auto taskTuple = std::move(queue_.front());
+
+    queue_.pop_front();
+
+    auto& taskEntry = std::get<0>(taskTuple);
+    auto& task = std::get<1>(taskTuple);
+
+    // Release the queue lock while the task runs so producers are not blocked,
+    // and let a destructor waiting for an empty queue re-check.
+    lock.unlock();
+    queue_consume.notify_one();
+
+    try {
+      taskEntry->run_(taskEntry);
+      task->FinishMPITask();
+    } catch (...) {
+      // Hand the failure to the waiter instead of killing the worker thread.
+      task->FinishMPITaskError(std::current_exception());
+    }
+
+    lock.lock();
+  }
+}
+
+// Queues `entry` for the worker thread and returns the MPITask callers can
+// wait on. The task is created from the entry's destination tensors and the
+// given `inputs`.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Enqueue(
+    std::unique_ptr<TaskEntry> entry,
+    const std::vector<phi::DenseTensor>& inputs) {
+  auto task = std::make_shared<MPITask>(entry->dst_, inputs);
+  std::unique_lock<std::mutex> lock(pg_mutex);
+  queue_.push_back(std::make_tuple(std::move(entry), task));
+  lock.unlock();
+  // Notify after unlocking so the worker does not wake up into a held lock.
+  queue_produce.notify_one();
+  return task;
+}
+
+// Queues an MPI_Bcast of in_tensors[0] over pg_comm.
+//
+// The root passed to MPI is opts.source_rank + opts.source_root. The buffer
+// used is the one of the first source tensor; out_tensors is stored in the
+// TaskEntry but not touched by the callable.
+// NOTE(review): presumably callers pass the same tensor as input and output —
+// confirm.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Broadcast(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    const BroadcastOptions& opts) {
+  mpi::CheckValidInputs(in_tensors);
+  // NOTE(review): `places` is not used below.
+  const auto places = GetPlaceList(in_tensors);
+
+  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
+      [opts, this](std::unique_ptr<TaskEntry>& entry) {
+        auto data = (entry->src_)[0];
+        // Serialize with every other MPI call made by this backend.
+        std::unique_lock<std::mutex> lock(pg_global_mutex);
+        const auto root = opts.source_rank + opts.source_root;
+        MPI_CHECK(MPI_Bcast(data.data(),
+                            data.numel(),
+                            mpiDatatype.at(data.dtype()),
+                            root,
+                            pg_comm));
+      };
+  auto entry = std::make_unique<TaskEntry>(
+      &in_tensors, &out_tensors, std::move(runFunc));
+  return Enqueue(std::move(entry), in_tensors);
+}
+
+// Queues an in-place MPI_Allreduce of in_tensors[0] over pg_comm using the
+// reduction selected by opts.reduce_op. out_tensors is stored in the TaskEntry
+// but not touched by the callable.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllReduce(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    const AllreduceOptions& opts) {
+  mpi::CheckValidInputs(in_tensors);
+
+  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
+      [opts, this](std::unique_ptr<TaskEntry>& entry) {
+        auto data = (entry->src_)[0];
+        std::unique_lock<std::mutex> lock(pg_global_mutex);
+        // MPI_IN_PLACE: the send data is taken from, and the result written
+        // to, the receive buffer.
+        MPI_CHECK(MPI_Allreduce(MPI_IN_PLACE,
+                                data.data(),
+                                data.numel(),
+                                mpiDatatype.at(data.dtype()),
+                                mpi::ToMPIType(opts.reduce_op),
+                                pg_comm));
+      };
+  auto entry = std::make_unique<TaskEntry>(
+      &in_tensors, &out_tensors, std::move(runFunc));
+  return Enqueue(std::move(entry), in_tensors);
+}
+
+// Queues an MPI_Barrier on pg_comm. `opts` is not used; the task carries no
+// tensors.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Barrier(
+    const BarrierOptions& opts) {
+  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
+      [this](std::unique_ptr<TaskEntry>& entry) {
+        std::unique_lock<std::mutex> lock(pg_global_mutex);
+        MPI_CHECK(MPI_Barrier(pg_comm));
+      };
+  auto entry =
+      std::make_unique<TaskEntry>(nullptr, nullptr, std::move(runFunc));
+  return Enqueue(std::move(entry), std::vector<phi::DenseTensor>{});
+}
+
+// Posts a non-blocking send (MPI_Isend) of tensors[0] to `dst_rank` on
+// pg_comm and returns an MPIAsyncTask wrapping the request.
+// NOTE: the message tag is this group's gid_.
+// Unlike the collectives, the call is issued on the caller's thread rather
+// than through the worker queue.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Send(
+    std::vector<phi::DenseTensor>& tensors, int dst_rank) {
+  mpi::CheckValidInputs(tensors);
+
+  auto& tensor = tensors[0];
+  MPI_Request request = MPI_REQUEST_NULL;
+
+  {
+    // Hold the global MPI lock only while posting the request.
+    std::unique_lock<std::mutex> lock(pg_global_mutex);
+    MPI_CHECK(MPI_Isend(tensor.data(),
+                        tensor.numel(),
+                        mpiDatatype.at(tensor.dtype()),
+                        dst_rank,
+                        this->gid_,
+                        pg_comm,
+                        &request));
+  }
+
+  return std::make_shared<ProcessGroupMPI::MPIAsyncTask>(request, tensors);
+}
+
+// Posts a non-blocking receive (MPI_Irecv) into tensors[0] from `src_rank` on
+// pg_comm, matching on tag gid_, and returns an MPIAsyncTask wrapping the
+// request. The call is issued on the caller's thread, not the worker queue.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Recv(
+    std::vector<phi::DenseTensor>& tensors, int src_rank) {
+  mpi::CheckValidInputs(tensors);
+
+  auto& tensor = tensors[0];
+  MPI_Request request = MPI_REQUEST_NULL;
+
+  {
+    // Hold the global MPI lock only while posting the request.
+    std::unique_lock<std::mutex> lock(pg_global_mutex);
+    MPI_CHECK(MPI_Irecv(tensor.data(),
+                        tensor.numel(),
+                        mpiDatatype.at(tensor.dtype()),
+                        src_rank,
+                        this->gid_,
+                        pg_comm,
+                        &request));
+  }
+
+  return std::make_shared<ProcessGroupMPI::MPIAsyncTask>(request, tensors);
+}
+
+// Queues an MPI_Allgather over pg_comm: every rank contributes in_tensors[0]
+// and the gathered data is written to out_tensors[0].
+//
+// Both lists must hold exactly one tensor. The per-rank element count and the
+// datatype are taken from the input tensor for both send and receive sides.
+// NOTE(review): assumes out_tensors[0] is large enough for one input-sized
+// chunk per rank — not checked here, confirm at the call site.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllGather(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors) {
+  mpi::CheckValidInputs(in_tensors);
+
+  PADDLE_ENFORCE_EQ(out_tensors.size() == 1,
+                    true,
+                    platform::errors::InvalidArgument(
+                        "MPI only support a single tensor op."));
+
+  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
+      [this](std::unique_ptr<TaskEntry>& entry) {
+        auto data = (entry->src_)[0];
+        std::vector<phi::DenseTensor> dst = entry->dst_;
+
+        std::unique_lock<std::mutex> lock(pg_global_mutex);
+        MPI_CHECK(MPI_Allgather(data.data(),
+                                data.numel(),
+                                mpiDatatype.at(data.dtype()),
+                                dst[0].data(),
+                                data.numel(),
+                                mpiDatatype.at(data.dtype()),
+                                pg_comm));
+      };
+
+  auto entry = std::make_unique<TaskEntry>(
+      &in_tensors, &out_tensors, std::move(runFunc));
+
+  return Enqueue(std::move(entry), in_tensors);
+}
+
+// Queues an MPI_Alltoall over pg_comm between in_tensors[0] and
+// out_tensors[0].
+//
+// Both lists must pass CheckValidInputs, and the two tensors must agree in
+// element count and dtype. Each peer exchanges numel() / size_ elements
+// (integer division).
+std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllToAll(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors) {
+  mpi::CheckValidInputs(in_tensors);
+  mpi::CheckValidInputs(out_tensors);
+
+  PADDLE_ENFORCE_EQ(in_tensors[0].numel() == out_tensors[0].numel() &&
+                        in_tensors[0].dtype() == out_tensors[0].dtype(),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "MPI AlltoAll: input and output are not equal in "
+                        "size or data type."));
+
+  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
+      [this](std::unique_ptr<TaskEntry>& entry) {
+        auto srcdata = (entry->src_)[0];
+        auto dstdata = (entry->dst_)[0];
+        std::unique_lock<std::mutex> lock(pg_global_mutex);
+        // Counts are per destination/source rank, hence the division by the
+        // group size.
+        MPI_CHECK(MPI_Alltoall(srcdata.data(),
+                               srcdata.numel() / size_,
+                               mpiDatatype.at(srcdata.dtype()),
+                               dstdata.data(),
+                               dstdata.numel() / size_,
+                               mpiDatatype.at(dstdata.dtype()),
+                               pg_comm));
+      };
+  auto entry = std::make_unique<TaskEntry>(
+      &in_tensors, &out_tensors, std::move(runFunc));
+
+  return Enqueue(std::move(entry), in_tensors);
+}
+
+// Queues an MPI_Reduce of tensors[0] onto opts.root_rank over pg_comm.
+//
+// On the root the reduction is done in place (MPI_IN_PLACE as send buffer,
+// the tensor as receive buffer); on other ranks the tensor is the send buffer
+// and no receive buffer is supplied.
+// NOTE(review): `out_tensors` is not used — the TaskEntry is built with
+// `tensors` as both source and destination. Confirm this is intended.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Reduce(
+    std::vector<phi::DenseTensor>& tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    const ReduceOptions& opts) {
+  mpi::CheckValidInputs(tensors);
+
+  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
+      [opts, this](std::unique_ptr<TaskEntry>& entry) {
+        auto data = (entry->src_)[0];
+        auto dataPtr = (entry->src_)[0].data();
+        void* sendbuf = (rank_ == opts.root_rank) ? MPI_IN_PLACE : dataPtr;
+        void* recvbuf = (rank_ == opts.root_rank) ? dataPtr : nullptr;
+
+        std::unique_lock<std::mutex> lock(pg_global_mutex);
+        MPI_CHECK(MPI_Reduce(sendbuf,
+                             recvbuf,
+                             data.numel(),
+                             mpiDatatype.at(data.dtype()),
+                             mpi::ToMPIType(opts.reduce_op),
+                             opts.root_rank,
+                             pg_comm));
+      };
+  auto entry =
+      std::make_unique<TaskEntry>(&tensors, &tensors, std::move(runFunc));
+  return Enqueue(std::move(entry), tensors);
+}
+
+// Queues an MPI_Scatter from opts.root_rank over pg_comm.
+//
+// Every rank receives into out_tensors[0]; the per-rank element count and
+// datatype come from that destination tensor. Only the root supplies a send
+// buffer (in_tensors[0]); other ranks pass a null send buffer and build their
+// TaskEntry without source tensors.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Scatter(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    const ScatterOptions& opts) {
+  mpi::CheckValidInputs(in_tensors);
+
+  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
+      [opts, this](std::unique_ptr<TaskEntry>& entry) {
+        auto data = (entry->dst_)[0];
+        void* sendbuf = nullptr;
+
+        // src_ is only populated on the root (see the TaskEntry built below).
+        if (rank_ == opts.root_rank) {
+          std::vector<phi::DenseTensor>& inputData = entry->src_;
+          sendbuf = inputData[0].data();
+        }
+
+        std::unique_lock<std::mutex> lock(pg_global_mutex);
+        MPI_CHECK(MPI_Scatter(sendbuf,
+                              data.numel(),
+                              mpiDatatype.at(data.dtype()),
+                              data.data(),
+                              data.numel(),
+                              mpiDatatype.at(data.dtype()),
+                              opts.root_rank,
+                              pg_comm));
+      };
+
+  if (rank_ == opts.root_rank) {
+    auto entry = std::make_unique<TaskEntry>(
+        &in_tensors, &out_tensors, std::move(runFunc));
+    return Enqueue(std::move(entry), in_tensors);
+  } else {
+    auto entry =
+        std::make_unique<TaskEntry>(nullptr, &out_tensors, std::move(runFunc));
+    return Enqueue(std::move(entry), in_tensors);
+  }
+}
+
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/distributed/collective/ProcessGroupMPI.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupMPI.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <chrono>
+#include <condition_variable>
+#include <deque>
+#include <exception>
+#include <functional>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/distributed/collective/Types.h"
+#include "paddle/fluid/platform/device_context.h"
+
+#if defined(PADDLE_WITH_MPI)
+#include "paddle/fluid/distributed/collective/MPITools.h"
+#endif
+
+constexpr const char* MPI_BACKEND_NAME = "MPI";
+
+namespace paddle {
+namespace distributed {
+
+// One queued unit of work for the ProcessGroupMPI worker thread: copies of the
+// source and destination tensor lists plus the callable that issues the MPI
+// call. Non-copyable.
+struct TaskEntry {
+  // A null src_ptr / dst_ptr leaves the corresponding list empty.
+  explicit TaskEntry(std::vector<phi::DenseTensor>* src_ptr,
+                     std::vector<phi::DenseTensor>* dst_ptr,
+                     std::function<void(std::unique_ptr<TaskEntry>&)> run)
+      : src_(src_ptr != nullptr ? *src_ptr : std::vector<phi::DenseTensor>()),
+        dst_(dst_ptr != nullptr ? *dst_ptr : std::vector<phi::DenseTensor>()),
+        run_(std::move(run)) {}
+
+  TaskEntry(const TaskEntry&) = delete;
+  TaskEntry& operator=(const TaskEntry&) = delete;
+
+  std::vector<phi::DenseTensor> src_;
+  std::vector<phi::DenseTensor> dst_;
+
+  int* srcRank_ = nullptr;
+  // Invoked by the worker thread with the owning unique_ptr of this entry.
+  std::function<void(std::unique_ptr<TaskEntry>&)> run_;
+};
+
+// ProcessGroup backend implemented on top of MPI.
+//
+// Collectives are wrapped in TaskEntry objects and executed in order by one
+// worker thread per group; Send/Recv post non-blocking MPI requests directly
+// on the calling thread. Every MPI call is serialized through the static
+// pg_global_mutex.
+class ProcessGroupMPI : public ProcessGroup {
+ public:
+  // Handle for a collective executed by the worker thread. Completion (or
+  // failure) is signalled through Finish().
+  class MPITask : public ProcessGroup::Task {
+   public:
+    explicit MPITask(std::vector<phi::DenseTensor> outputTensors,
+                     const std::vector<phi::DenseTensor>& inputTensors)
+        : ProcessGroup::Task(-1, inputTensors, CommType::UNKNOWN),
+          outputs_(std::move(outputTensors)) {}
+
+    void Synchronize() { Wait(); }
+
+    // Blocks until the task has finished. With the default kWaitTimeout the
+    // wait is unbounded; any other value raises InvalidArgument when the task
+    // is still unfinished after `timeout`. Rethrows the exception recorded by
+    // the worker, if any; otherwise returns true.
+    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout) {
+      std::unique_lock<std::mutex> lock(mutex_);
+      if (timeout == kWaitTimeout) {
+        // This waits without a timeout.
+        cv_.wait(lock, [&] { return is_completed_; });
+      } else {
+        // Waits for the user-provided timeout.
+        cv_.wait_for(lock, timeout, [&] { return is_completed_; });
+        PADDLE_ENFORCE_EQ(
+            is_completed_,
+            true,
+            platform::errors::InvalidArgument("MPI operation timeout! "));
+      }
+      if (exception_) {
+        std::rethrow_exception(exception_);
+      }
+      return true;
+    }
+
+   protected:
+    friend class ProcessGroupMPI;
+
+   private:
+    // about mpi
+    // Records the outcome and wakes every waiter. The state is published
+    // under mutex_ — the same mutex Wait() holds while evaluating its
+    // predicate — so a waiter cannot miss the notification between checking
+    // is_completed_ and going to sleep, and the write does not race with the
+    // read in Wait().
+    void Finish(std::exception_ptr exception = nullptr) {
+      {
+        std::lock_guard<std::mutex> lock(mutex_);
+        is_completed_ = true;
+        exception_ = exception;
+      }
+      cv_.notify_all();
+    }
+    void FinishMPITask();
+    void FinishMPITaskError(std::exception_ptr eptr);
+
+    std::vector<phi::DenseTensor> outputs_;
+    std::condition_variable cv_;
+    std::exception_ptr exception_;
+  };
+
+ public:
+  // Handle for a non-blocking point-to-point request posted by Send/Recv.
+  // The request must be completed before the task is destroyed.
+  class MPIAsyncTask : public ProcessGroup::Task {
+   public:
+    MPIAsyncTask(MPI_Request request,
+                 const std::vector<phi::DenseTensor>& inputs);
+
+    bool IsCompleted();
+
+    void Synchronize() {}
+
+    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
+
+    void SetOutputs(std::vector<phi::DenseTensor>& outputs);  // NOLINT
+
+    virtual ~MPIAsyncTask();
+
+   protected:
+    // Turns status_.MPI_ERROR into a stored std::runtime_error.
+    void AppearException();
+
+   private:
+    std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
+    MPI_Request request_;
+    MPI_Status status_;
+    std::exception_ptr exception_;
+  };
+
+  // `pgComm` must not be MPI_COMM_NULL. Starts the worker thread.
+  ProcessGroupMPI(int rank, int size, MPI_Comm pgComm, int gid);
+
+  // Drains the queue, then stops and joins the worker thread.
+  virtual ~ProcessGroupMPI();
+
+  const std::string GetBackendName() const override {
+    return std::string(MPI_BACKEND_NAME);
+  }
+
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors,
+      const AllreduceOptions& = AllreduceOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Task> Broadcast(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors,
+      const BroadcastOptions& = BroadcastOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Task> Barrier(
+      const BarrierOptions& = BarrierOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Task> Send(
+      std::vector<phi::DenseTensor>& tensors, int dst_rank) override;
+
+  std::shared_ptr<ProcessGroup::Task> Recv(
+      std::vector<phi::DenseTensor>& tensors, int src_rank) override;
+
+  std::shared_ptr<ProcessGroup::Task> AllGather(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors) override;
+
+  std::shared_ptr<ProcessGroup::Task> AllToAll(
+      std::vector<phi::DenseTensor>& in,
+      std::vector<phi::DenseTensor>& out) override;
+
+  std::shared_ptr<ProcessGroup::Task> Reduce(
+      std::vector<phi::DenseTensor>& tensors,
+      std::vector<phi::DenseTensor>& out_tensors,
+      const ReduceOptions& opts) override;
+
+  std::shared_ptr<ProcessGroup::Task> Scatter(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors,
+      const ScatterOptions&) override;
+
+  // Factory: initializes MPI on first use and builds a group over `ranks`
+  // (the whole world when `ranks` is empty). Returns an empty pointer when the
+  // resulting communicator is MPI_COMM_NULL.
+  static std::shared_ptr<ProcessGroupMPI> CreateProcessGroupMPI(
+      const std::vector<int>& ranks, int gid);
+
+ protected:
+  // Worker thread body: runs queued TaskEntry objects one at a time.
+  void workLoop();
+
+  // Appends `entry` to the queue and returns the MPITask to wait on.
+  std::shared_ptr<ProcessGroup::Task> Enqueue(
+      std::unique_ptr<TaskEntry> entry,
+      const std::vector<phi::DenseTensor>& inputs);
+
+ private:
+  // Set by the destructor (under pg_mutex) to make workLoop() exit.
+  bool stop_{false};
+  // Guards queue_ and stop_.
+  std::mutex pg_mutex;
+  std::thread worker_thread;
+  std::deque<std::tuple<std::unique_ptr<TaskEntry>, std::shared_ptr<MPITask>>>
+      queue_;
+  // Signalled when work is enqueued or the group is stopping.
+  std::condition_variable queue_produce;
+  // Signalled when the worker has dequeued an entry.
+  std::condition_variable queue_consume;
+
+  static void InitOneTimeMPI();
+  static void ExitMPI();
+  static std::once_flag onceFlag;
+
+  // Process-wide lock serializing all MPI calls made by this backend.
+  static std::mutex pg_global_mutex;
+  // Thread-support level reported by MPI at initialization.
+  static int mpi_thread_support;
+
+  MPI_Comm pg_comm;
+};
+
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -151,6 +151,9 @@ if(WITH_PYTHON)
  if(WITH_GLOO)
    set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo)
  endif()
+  if(WITH_MPI)
+    set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_mpi)
+  endif()
  if(WITH_ASCEND_CL)
    set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl)
    if(WITH_PSCORE)
@@ -591,6 +594,10 @@ if(WITH_PYTHON)
    target_link_libraries(libpaddle ${ROCM_HIPRTC_LIB})
  endif()

+  if(WITH_MPI)
+    target_link_libraries(libpaddle ${MPI_CXX_LIBRARIES})
+  endif()
+
  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
  target_link_libraries(libpaddle ${os_dependency_modules})
  add_dependencies(libpaddle op_function_generator_cmd)

--- a/paddle/fluid/pybind/distributed_py.cc
+++ b/paddle/fluid/pybind/distributed_py.cc
@@ -36,6 +36,10 @@ limitations under the License. */
 #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
 #endif

+#if defined(PADDLE_WITH_MPI)
+#include "paddle/fluid/distributed/collective/ProcessGroupMPI.h"
+#endif
+
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
 #endif
@@ -623,6 +627,25 @@ void BindDistributed(py::module *m) {

 #endif

+#if defined(PADDLE_WITH_MPI)
+  py::class_<distributed::ProcessGroupMPI,
+             std::shared_ptr<distributed::ProcessGroupMPI>>(
+      *m, "ProcessGroupMPI", ProcessGroup)
+      .def_static(
+          "create",
+          [](const std::vector<int> &ranks,
+             int gid) -> std::shared_ptr<distributed::ProcessGroupMPI> {
+            return paddle::distributed::ProcessGroupMPI::CreateProcessGroupMPI(
+                ranks, gid);
+          })
+      .def("get_rank",
+           &distributed::ProcessGroup::GetRank,
+           py::call_guard<py::gil_scoped_release>())
+      .def("get_world_size",
+           &distributed::ProcessGroup::GetSize,
+           py::call_guard<py::gil_scoped_release>());
+#endif
+
 #if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \
    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL))
  py::class_<distributed::ProcessGroupHeter,

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -229,6 +229,23 @@ bool IsCompiledWithNCCL() {
 #endif
 }

+bool IsCompiledWithMPI() {  // true only when built with PADDLE_WITH_MPI
+#ifndef PADDLE_WITH_MPI
+  return false;
+#else
+  return true;
+#endif  // PADDLE_WITH_MPI
+}
+
+// NOTE: some MPI libraries are CUDA-aware; support is planned for the future.
+bool IsCompiledWithMPIAWARE() {
+#ifndef PADDLE_WITH_MPI_AWARE
+  return false;
+#else
+  return true;
+#endif  // PADDLE_WITH_MPI_AWARE
+}
+
 bool IsCompiledWithROCM() {
 #ifndef PADDLE_WITH_HIP
  return false;
@@ -1718,6 +1735,8 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("is_compiled_with_xpu", IsCompiledWithXPU);
  m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
  m.def("is_compiled_with_nccl", IsCompiledWithNCCL);
+  m.def("is_compiled_with_mpi", IsCompiledWithMPI);
+  m.def("is_compiled_with_mpi_aware", IsCompiledWithMPIAWARE);
  m.def("is_compiled_with_cinn", IsCompiledWithCINN);
  m.def("is_compiled_with_mlu", IsCompiledWithMLU);
  m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS);

--- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt
@@ -323,5 +323,17 @@ if((WITH_ROCM OR WITH_GPU) AND (LINUX))
    "PADDLE_DIST_UT_PORT=21532;http_proxy=;https_proxy=")
  set_tests_properties(test_world_size_and_rank PROPERTIES TIMEOUT "120")
 endif()
+if(WITH_MPI) # register the MPI communication test only for MPI builds
+  if(LOCAL_ALL_ARCH AND (LINUX))
+    bash_test_modules(
+      test_mpi_comm
+      START_BASH
+      test_mpi_comm.sh # runs process_group_mpi.py under mpirun
+      LABELS
+      "RUN_TYPE=DIST"
+      ENVS
+      "PADDLE_DIST_UT_PORT=21672;http_proxy=;https_proxy=")
+  endif()
+endif()
 add_subdirectory(fleet)
 add_subdirectory(multinode)
--- a/python/paddle/fluid/tests/unittests/collective/process_group_mpi.py
+++ b/python/paddle/fluid/tests/unittests/collective/process_group_mpi.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import random
+import numpy as np
+import os
+import shutil
+
+import paddle
+from paddle.fluid import core
+from datetime import timedelta
+import paddle.fluid.core as core
+from paddle.fluid.framework import _test_eager_guard
+from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.distributed.collective import Group
+from paddle.distributed.collective import _group_map_by_name
+from paddle.distributed.collective import _default_group_name
+from paddle.distributed.collective import _set_group_map
+from paddle.distributed.collective import _set_group_map_by_name
+from paddle.distributed.collective import _set_group_map_backend
+from paddle.fluid.framework import _set_expected_place
+import paddle.distributed as dist
+import ctypes
+
+# Load libmpi.so with RTLD_GLOBAL at import time, before any process group is
+# created. NOTE(review): presumably required so that MPI symbols are visible
+# to libraries loaded afterwards -- confirm.
+ctypes.CDLL("libmpi.so", mode=ctypes.RTLD_GLOBAL)
+
+
+def init_process_group(strategy=None):
+    """Create the MPI process group and register it as the default group.
+
+    Args:
+        strategy: Not read by this function.
+
+    Returns:
+        The ``Group`` object wrapping the new ``ProcessGroupMPI``.
+    """
+    group_id = 0
+    mpi_pg = core.ProcessGroupMPI.create([], group_id)
+    my_rank = mpi_pg.get_rank()
+    nranks = mpi_pg.get_world_size()
+
+    # The checks in this file run on CPU tensors.
+    _set_expected_place(core.CPUPlace())
+
+    default_group = Group(my_rank,
+                          nranks,
+                          id=group_id,
+                          ranks=list(range(nranks)),
+                          pg=mpi_pg,
+                          name=_default_group_name)
+
+    # Register the group by name, by id and by backend.
+    _set_group_map_by_name(_default_group_name, default_group)
+    _set_group_map(group_id, default_group)
+    _set_group_map_backend(default_group, "mpi")
+
+    return default_group
+
+
+def test_allreduce_sum(pg, shape, dtype):
+    """Check that all_reduce without an explicit op yields x + y.
+
+    Args:
+        pg: process group; only ``rank()`` is queried here.
+        shape: shape of the random operands.
+        dtype: numpy dtype name of the random operands.
+    """
+    # Every rank draws both operands so it can compute the expected value
+    # locally. NOTE(review): relies on all ranks sharing the numpy seed.
+    rank0_data = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    rank1_data = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    expected = rank0_data + rank1_data
+
+    # Rank 0 contributes the first operand, any other rank the second.
+    local = rank0_data if pg.rank() == 0 else rank1_data
+    dist.all_reduce(local)
+    assert np.array_equal(local, expected)
+    print("test allreduce sum api ok")
+
+
+def test_allreduce_max(pg, shape, dtype):
+    """Check that all_reduce with ReduceOp.MAX yields the elementwise maximum.
+
+    Args:
+        pg: process group; only ``rank()`` is queried here.
+        shape: shape of the random operands.
+        dtype: numpy dtype name of the random operands.
+    """
+    rank0_data = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    rank1_data = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    expected = paddle.maximum(rank0_data, rank1_data)
+
+    # Rank 0 contributes the first operand, any other rank the second.
+    local = rank0_data if pg.rank() == 0 else rank1_data
+    task = dist.all_reduce(local, dist.ReduceOp.MAX, use_calc_stream=False)
+    task.wait()
+    assert np.array_equal(local, expected)
+    print("test allreduce max api ok")
+
+
+def test_allreduce_min(pg, shape, dtype):
+    """Check that all_reduce with ReduceOp.MIN yields the elementwise minimum.
+
+    Args:
+        pg: process group; only ``rank()`` is queried here.
+        shape: shape of the random operands.
+        dtype: numpy dtype name of the random operands.
+    """
+    rank0_data = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    rank1_data = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    expected = paddle.minimum(rank0_data, rank1_data)
+
+    # Rank 0 contributes the first operand, any other rank the second.
+    local = rank0_data if pg.rank() == 0 else rank1_data
+    task = dist.all_reduce(local, dist.ReduceOp.MIN, use_calc_stream=False)
+    task.wait()
+    assert np.array_equal(local, expected)
+    print("test allreduce min api ok")
+
+
+def test_allreduce_prod(pg, shape, dtype):
+    """Check that all_reduce with ReduceOp.PROD yields the elementwise product.
+
+    Args:
+        pg: process group; only ``rank()`` is queried here.
+        shape: shape of the random operands.
+        dtype: numpy dtype name of the random operands.
+    """
+    rank0_np = np.random.random(shape).astype(dtype)
+    rank1_np = np.random.random(shape).astype(dtype)
+    # The expected value is computed with numpy from the raw arrays.
+    expected = np.multiply(rank0_np, rank1_np)
+
+    # Rank 0 contributes the first operand, any other rank the second.
+    if pg.rank() == 0:
+        local = paddle.to_tensor(rank0_np)
+    else:
+        local = paddle.to_tensor(rank1_np)
+    task = dist.all_reduce(local, dist.ReduceOp.PROD, use_calc_stream=False)
+    task.wait()
+    assert np.array_equal(local, expected)
+    print("test allreduce prod api ok")
+
+
+def test_broadcast(pg, shape, dtype):
+    """Check that broadcasting from rank 0 overwrites the other rank's tensor.
+
+    Args:
+        pg: process group; only ``rank()`` is queried here.
+        shape: shape of the random tensors.
+        dtype: numpy dtype name of the random tensors.
+    """
+    source = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    target = paddle.to_tensor(np.random.random(shape).astype(dtype))
+
+    # Keep a copy of the source so the comparison does not alias it.
+    expected = paddle.assign(source)
+    if pg.rank() != 0:
+        # Receiving side: broadcast is called without use_calc_stream.
+        dist.broadcast(target, 0)
+        assert np.array_equal(expected, target)
+    else:
+        # Root side: use the returned task and check that it completed.
+        task = dist.broadcast(source, 0, use_calc_stream=False)
+        task.synchronize()
+        assert task.is_completed()
+        assert np.array_equal(expected, source)
+    print("test broadcast api ok")
+
+
+def test_barrair(pg):
+    """Run a barrier through both the dist API and the raw process group.
+
+    Args:
+        pg: process group; provides ``rank()`` and ``barrier()``.
+    """
+    if pg.rank() != 0:
+        # Non-zero ranks call the process group directly and wait on the task.
+        pg.barrier().wait()
+    else:
+        # Rank 0 goes through paddle.distributed.
+        dist.barrier()
+    print("test barrier api ok\n")
+
+
+def _run_allgather_case(pg, shape, dtype, preallocate, message):
+    """Run one all_gather round with fresh buffers and verify the result.
+
+    Args:
+        pg: process group; provides ``rank()`` and ``all_gather()``.
+        shape: shape of each rank's input tensor.
+        dtype: numpy dtype name of the tensors.
+        preallocate: if True, the non-zero rank hands two empty tensors to
+            ``dist.all_gather``; otherwise it passes an empty list.
+        message: text printed once the assertions pass.
+    """
+    # Every rank draws both inputs so it can check both halves locally.
+    # NOTE(review): relies on all ranks sharing the numpy seed.
+    x = np.random.random(shape).astype(dtype)
+    y = np.random.random(shape).astype(dtype)
+    tensor_x = paddle.to_tensor(x)
+    tensor_y = paddle.to_tensor(y)
+
+    out_shape = list(shape)
+    out_shape[0] *= 2
+    half = out_shape[0] // 2
+    # The output buffer starts with random content, so a gather that writes
+    # nothing cannot accidentally satisfy the assertions below.
+    tensor_out = paddle.to_tensor(np.random.random(out_shape).astype(dtype))
+
+    if pg.rank() == 0:
+        # Rank 0 calls the process group directly.
+        task = pg.all_gather(tensor_x, tensor_out)
+        task.wait()
+    else:
+        # The other rank goes through paddle.distributed.
+        if preallocate:
+            tensor_out_list = [
+                paddle.empty_like(tensor_x),
+                paddle.empty_like(tensor_x)
+            ]
+        else:
+            tensor_out_list = []
+        dist.all_gather(tensor_out_list, tensor_y, use_calc_stream=False)
+        tensor_out = paddle.concat(tensor_out_list)
+
+    out_1 = paddle.slice(tensor_out, [0], [0], [half])
+    out_2 = paddle.slice(tensor_out, [0], [half], [out_shape[0]])
+    assert np.array_equal(tensor_x, out_1)
+    assert np.array_equal(tensor_y, out_2)
+    print(message)
+
+
+def test_allgather(pg, shape, dtype):
+    """Check all_gather with preallocated outputs and with an empty list.
+
+    Each case builds its own inputs and output buffer. Reusing the first
+    case's output buffer on rank 0 would make the second case pass even if
+    the second gather wrote nothing.
+
+    Args:
+        pg: process group under test.
+        shape: shape of each rank's input tensor.
+        dtype: numpy dtype name of the tensors.
+    """
+    _run_allgather_case(pg, shape, dtype, True, "test allgather api ok\n")
+    _run_allgather_case(pg, shape, dtype, False, "test allgather api2 ok\n")
+
+
+def _run_all2all_case(pg, shape, dtype, preallocate, message):
+    """Run one alltoall round and verify the chunk received from the peer.
+
+    Args:
+        pg: process group; provides ``rank()`` and ``alltoall()``.
+        shape: shape of each rank's input tensor.
+        dtype: numpy dtype name of the tensors.
+        preallocate: if True, the non-zero rank passes the two halves of its
+            output buffer to ``dist.alltoall``; otherwise an empty list.
+        message: text printed once the assertion passes.
+    """
+    # Every rank draws all four arrays so it knows the peer's input.
+    # NOTE(review): relies on all ranks sharing the numpy seed.
+    x = np.random.random(shape).astype(dtype)
+    y = np.random.random(shape).astype(dtype)
+    out1 = np.random.random(shape).astype(dtype)
+    out2 = np.random.random(shape).astype(dtype)
+    tensor_x = paddle.to_tensor(x)
+    tensor_y = paddle.to_tensor(y)
+    tensor_out1 = paddle.to_tensor(out1)
+    tensor_out2 = paddle.to_tensor(out2)
+
+    half = shape[0] // 2
+    # Reference chunks: second half of x and first half of y.
+    raw_tensor_x_2 = paddle.slice(tensor_x, [0], [half], [shape[0]])
+    raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0], [half])
+
+    if pg.rank() == 0:
+        # Rank 0 calls the process group directly.
+        task = pg.alltoall(tensor_x, tensor_out1)
+        task.wait()
+        # The second half of the output must equal the first half of y.
+        out1_2 = paddle.slice(tensor_out1, [0], [half], [shape[0]])
+        assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
+    else:
+        # The other rank goes through paddle.distributed.
+        in_1, in_2 = paddle.split(tensor_y, 2)
+        if preallocate:
+            out_1, out_2 = paddle.split(tensor_out2, 2)
+            out_tensor_list = [out_1, out_2]
+        else:
+            out_tensor_list = []
+        dist.alltoall([in_1, in_2], out_tensor_list)
+        tensor_out2 = paddle.concat(out_tensor_list)
+        # The first half of the output must equal the second half of x.
+        out2_1 = paddle.slice(tensor_out2, [0], [0], [half])
+        assert np.array_equal(out2_1, raw_tensor_x_2)
+    print(message)
+
+
+def test_all2all(pg, shape, dtype):
+    """Check alltoall with preallocated outputs and with an empty list.
+
+    Args:
+        pg: process group under test.
+        shape: shape of each rank's input tensor.
+        dtype: numpy dtype name of the tensors.
+    """
+    _run_all2all_case(pg, shape, dtype, True, "test alltoall api ok\n")
+    _run_all2all_case(pg, shape, dtype, False, "test alltoall api2 ok\n")
+
+
+def test_reduce_sum(pg, shape, dtype):
+    """Check that reduce to rank 0 without an explicit op yields x + y.
+
+    Args:
+        pg: process group; only ``rank()`` is queried here.
+        shape: shape of the random operands.
+        dtype: numpy dtype name of the random operands.
+    """
+    rank0_np = np.random.random(shape).astype(dtype)
+    rank1_np = np.random.random(shape).astype(dtype)
+    rank0_data = paddle.to_tensor(rank0_np)
+    rank1_data = paddle.to_tensor(rank1_np)
+    expected = rank0_data + rank1_data
+
+    if pg.rank() == 0:
+        # Root: reduce on the calc stream, then check the result in place.
+        dist.reduce(rank0_data, 0, use_calc_stream=True)
+        assert np.array_equal(rank0_data, expected)
+    else:
+        # Non-root: wait on the returned task; nothing to verify here.
+        task = dist.reduce(rank1_data, 0, use_calc_stream=False)
+        task.wait()
+    print("test reduce sum api ok\n")
+
+
+def test_reduce_max(pg, shape, dtype):
+    """Check that reduce with ReduceOp.MAX leaves the maximum on rank 0.
+
+    Args:
+        pg: process group; only ``rank()`` is queried here.
+        shape: shape of the random operands.
+        dtype: numpy dtype name of the random operands.
+    """
+    rank0_data = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    rank1_data = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    expected = paddle.maximum(rank0_data, rank1_data)
+
+    is_root = pg.rank() == 0
+    local = rank0_data if is_root else rank1_data
+    task = dist.reduce(local, 0, dist.ReduceOp.MAX, use_calc_stream=False)
+    task.wait()
+    # Only the destination rank's tensor is checked.
+    if is_root:
+        assert np.array_equal(local, expected)
+    print("test reduce max api ok")
+
+
+def test_reduce_min(pg, shape, dtype):
+    """Check that reduce with ReduceOp.MIN leaves the minimum on rank 0.
+
+    Args:
+        pg: process group; only ``rank()`` is queried here.
+        shape: shape of the random operands.
+        dtype: numpy dtype name of the random operands.
+    """
+    rank0_data = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    rank1_data = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    expected = paddle.minimum(rank0_data, rank1_data)
+
+    is_root = pg.rank() == 0
+    local = rank0_data if is_root else rank1_data
+    task = dist.reduce(local, 0, dist.ReduceOp.MIN, use_calc_stream=False)
+    task.wait()
+    # Only the destination rank's tensor is checked.
+    if is_root:
+        assert np.array_equal(local, expected)
+    print("test reduce min api ok")
+
+
+def test_reduce_prod(pg, shape, dtype):
+    """Check that reduce with ReduceOp.PROD leaves the product on rank 0.
+
+    Args:
+        pg: process group; only ``rank()`` is queried here.
+        shape: shape of the random operands.
+        dtype: numpy dtype name of the random operands.
+    """
+    rank0_np = np.random.random(shape).astype(dtype)
+    rank1_np = np.random.random(shape).astype(dtype)
+    # The expected value is computed with numpy from the raw arrays.
+    expected = np.multiply(rank0_np, rank1_np)
+
+    is_root = pg.rank() == 0
+    local = paddle.to_tensor(rank0_np if is_root else rank1_np)
+    task = dist.reduce(local, 0, dist.ReduceOp.PROD, use_calc_stream=False)
+    task.wait()
+    # Only the destination rank's tensor is checked.
+    if is_root:
+        assert np.array_equal(local, expected)
+    print("test reduce prod api ok")
+
+
+def test_scatter(pg, shape, dtype):
+    """Check that scatter from rank 0 delivers one half of x to each rank.
+
+    Args:
+        pg: process group; only ``rank()`` is queried here.
+        shape: shape of the tensor each rank receives.
+        dtype: numpy dtype name of the tensors.
+    """
+    rows = shape[0]
+    # The source holds two receive-sized chunks stacked along axis 0.
+    src_shape = list(shape)
+    src_shape[0] *= 2
+    src_np = np.random.random(src_shape).astype(dtype)
+    recv_np = np.random.random(shape).astype(dtype)
+    src = paddle.to_tensor(src_np)
+    recv = paddle.to_tensor(recv_np)
+
+    if pg.rank() == 0:
+        # Root supplies both chunks and expects the first one back.
+        first_chunk, second_chunk = paddle.split(src, 2)
+        dist.scatter(recv, [first_chunk, second_chunk],
+                     0,
+                     use_calc_stream=True)
+        expected = paddle.slice(src, [0], [0], [rows])
+    else:
+        # Non-root passes an empty list and expects the second chunk.
+        task = dist.scatter(recv, [], 0, use_calc_stream=False)
+        task.wait()
+        expected = paddle.slice(src, [0], [rows], [rows * 2])
+    assert np.array_equal(recv, expected)
+    print("test scatter api ok\n")
+
+
+def _run_send_recv_case(pg, sub_group, shape, dtype, sync, message):
+    """Send a tensor from rank 0 to rank 1 and verify it on rank 1.
+
+    Args:
+        pg: process group; only ``rank()`` is queried here.
+        sub_group: group passed to ``dist.send`` / ``dist.recv``.
+        shape: shape of the random tensors.
+        dtype: numpy dtype name of the random tensors.
+        sync: value passed as ``use_calc_stream``; when False the returned
+            task is waited on explicitly.
+        message: text printed at the end of the case.
+    """
+    # Every rank draws both tensors so rank 1 knows what rank 0 sends.
+    # NOTE(review): relies on all ranks sharing the numpy seed.
+    payload = paddle.to_tensor(np.random.random(shape).astype(dtype))
+    inbox = paddle.to_tensor(np.random.random(shape).astype(dtype))
+
+    if pg.rank() == 0:
+        task = dist.send(payload, 1, group=sub_group, use_calc_stream=sync)
+        if not sync:
+            task.wait()
+    elif pg.rank() == 1:
+        task = dist.recv(inbox, 0, group=sub_group, use_calc_stream=sync)
+        if not sync:
+            task.wait()
+        assert np.array_equal(inbox, payload)
+
+    print(message)
+
+
+def test_send_recv(pg, sub_group, shape, dtype):
+    """Check point-to-point send/recv with and without the calc stream.
+
+    The two cases print different messages so the log shows which one ran.
+
+    Args:
+        pg: process group under test.
+        sub_group: group passed to ``dist.send`` / ``dist.recv``.
+        shape: shape of the random tensors.
+        dtype: numpy dtype name of the random tensors.
+    """
+    # Case 1: use_calc_stream=False, completion via task.wait().
+    _run_send_recv_case(pg, sub_group, shape, dtype, False,
+                        "test send api ok")
+    # Case 2: use_calc_stream=True, no explicit wait.
+    _run_send_recv_case(pg, sub_group, shape, dtype, True,
+                        "test send api2 ok")
+
+
+class TestProcessGroup(unittest.TestCase):
+    """Runs every MPI collective check in sequence inside one test method."""
+
+    def setUp(self):
+        # Seed paddle, random and numpy with the same fixed value.
+        for seed_fn in (paddle.seed, random.seed, np.random.seed):
+            seed_fn(2022)
+        self.config()
+
+    def config(self):
+        # dtype and shape handed to every tensor-based check.
+        self.dtype, self.shape = "float32", (2, 10, 5)
+
+    def test_create_process_group_mpi(self):
+        with _test_eager_guard():
+            group = init_process_group()
+            pg = group.process_group
+            tensor_args = (pg, self.shape, self.dtype)
+
+            # allreduce (sum / max / min / prod), then broadcast
+            for check in (test_allreduce_sum, test_allreduce_max,
+                          test_allreduce_min, test_allreduce_prod,
+                          test_broadcast):
+                check(*tensor_args)
+
+            # barrier takes only the process group
+            test_barrair(pg)
+
+            # allgather, alltoall, reduce (sum / max / min / prod), scatter
+            for check in (test_allgather, test_all2all, test_reduce_sum,
+                          test_reduce_max, test_reduce_min, test_reduce_prod,
+                          test_scatter):
+                check(*tensor_args)
+
+            # send / recv also needs the Group object
+            test_send_recv(pg, group, self.shape, self.dtype)
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/collective/test_mpi_comm.sh
+++ b/python/paddle/fluid/tests/unittests/collective/test_mpi_comm.sh
+#!/bin/bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+# use default values
+export PADDLE_DISTRI_BACKEND="mpi"
+
+# Look up mpirun without tripping `set -e`: a failed lookup inside a plain
+# assignment would abort the script before the fallback branch below runs.
+cmd=$(command -v mpirun || true)
+if [ -z "${cmd}" ]
+then
+    # No MPI launcher available: warn and leave with status 0.
+    echo "Warning! mpirun command not found!"
+else
+    # Start two ranks and forward the backend selection to them.
+    "${cmd}" -x PADDLE_DISTRI_BACKEND -np 2 --allow-run-as-root python3.8 process_group_mpi.py
+fi
--- a/python/paddle/fluid/tests/unittests/collective/testslist.csv
+++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv
@@ -38,3 +38,4 @@ test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_
 test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
 test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
 test_world_size_and_rank,linux,rocm;gpu,120,DIST,test_world_size_and_rank.sh,2,,http_proxy=;https_proxy=,
+test_mpi_comm,linux,,,DIST,test_mpi_comm.sh,2,,http_proxy=;https_proxy=,WITH_MPI