Unverified commit 9ce31e96, authored by: W wuhuachaocoding, committed by: GitHub

Mpi final dev simple (#46247)

Parent 3d656b58
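
A minimal usage sketch of the new MPI process group (a hedged example, not part of this commit; it assumes a build with WITH_MPI=ON, an Open MPI libmpi.so on the library path, and a launch such as "mpirun -np 2 python demo_mpi_pg.py", where demo_mpi_pg.py is a hypothetical file name mirroring the test added below):

# demo_mpi_pg.py (hypothetical name), run under mpirun with 2 processes
import ctypes

from paddle.fluid import core
from paddle.fluid.framework import _test_eager_guard

# Open MPI symbols must be globally visible before the process group is
# created (the added test does the same).
ctypes.CDLL("libmpi.so", mode=ctypes.RTLD_GLOBAL)

with _test_eager_guard():
    # An empty rank list selects all ranks of MPI_COMM_WORLD; 0 is the group id.
    pg = core.ProcessGroupMPI.create([], 0)
    print("rank", pg.get_rank(), "of", pg.get_world_size())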
...@@ -485,6 +485,9 @@ if(WITH_DISTRIBUTE)
ON
CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
endif()
set(WITH_MPI
ON
CACHE STRING "Enable MPI when compiling WITH_DISTRIBUTE=ON." FORCE)
if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC)
# disable WITH_PSCORE for NPU before include third_party
message(
...@@ -509,6 +512,10 @@ if(WITH_DISTRIBUTE)
endif()
endif()
if(WITH_MPI)
include(mpi)
endif()
include(third_party
)# download, build, install third_party, Contains about 20+ dependencies
......
if(NOT WITH_DISTRIBUTE OR NOT WITH_MPI)
return()
endif()
find_package(MPI)
if(NOT MPI_CXX_FOUND)
set(WITH_MPI
OFF
CACHE STRING "Disable MPI" FORCE)
message(WARNING "MPI support not found on the current system")
return()
endif()
message(STATUS "MPI compile flags: " ${MPI_CXX_COMPILE_FLAGS})
message(STATUS "MPI include path: " ${MPI_CXX_INCLUDE_PATH})
message(STATUS "MPI LINK flags path: " ${MPI_CXX_LINK_FLAGS})
message(STATUS "MPI libraries: " ${MPI_CXX_LIBRARIES})
include_directories(SYSTEM ${MPI_CXX_INCLUDE_PATH})
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_CXX_LINK_FLAGS}")
add_definitions("-DPADDLE_WITH_MPI")
find_program(
OMPI_INFO
NAMES ompi_info
HINTS ${MPI_CXX_LIBRARIES}/../bin)
if(OMPI_INFO)
execute_process(COMMAND ${OMPI_INFO} OUTPUT_VARIABLE output_)
if(output_ MATCHES "smcuda")
# NOTE: some MPI libraries support CUDA-aware MPI.
add_definitions("-DPADDLE_WITH_MPI_AWARE")
endif()
endif()
...@@ -43,6 +43,13 @@ if(WITH_NCCL OR WITH_RCCL)
endif()
endif()
if(WITH_MPI)
cc_library(
processgroup_mpi
SRCS ProcessGroupMPI.cc MPITools.cc Common.cc
DEPS collective_helper device_context)
endif()
if(WITH_ASCEND_CL)
cc_library(
processgroup_hccl processgroup_hccl
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/MPITools.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace paddle {
namespace distributed {
namespace mpi {
MPI_Op ToMPIType(ReduceOp reduction) {
static const std::map<ReduceOp, MPI_Op> red_type = {
{ReduceOp::MIN, MPI_MIN},
{ReduceOp::MAX, MPI_MAX},
{ReduceOp::SUM, MPI_SUM},
{ReduceOp::PRODUCT, MPI_PROD},
};
auto it = red_type.find(reduction);
PADDLE_ENFORCE_EQ(it != red_type.end(),
true,
platform::errors::InvalidArgument(
"Invalid mpi reduction. Must be MPI_MIN | MPI_MAX | "
"MPI_PROD | MPI_SUM."));
return it->second;
}
// NOTE: CUDA-aware MPI is not supported yet.
bool CheckMpiCudaAware() { return false; }
void CheckValidInputs(const std::vector<phi::DenseTensor>& tensors) {
PADDLE_ENFORCE_EQ(
tensors.size() == 1,
true,
platform::errors::InvalidArgument("the inputs size of MPI must be 1!"));
PADDLE_ENFORCE_EQ(CheckTensorsInCudaPlace(tensors) && !CheckMpiCudaAware(),
false,
platform::errors::InvalidArgument(
"Found CUDA Tensor. But CUDA-aware MPI not support!"));
}
} // namespace mpi
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <error.h>
#include <iostream>
#include <string>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/distributed/collective/Types.h"
#ifdef HOST
#undef HOST
#endif
#include <mpi.h>
namespace paddle {
namespace distributed {
namespace mpi {
#define MPI_CHECK(cmd) \
do { \
int r = cmd; \
if (r != MPI_SUCCESS) { \
LOG(FATAL) << "Failed, MPI error in" << __FILE__ << ":" << __LINE__ \
<< "with error code: " << std::to_string(r) << std::endl; \
exit(EXIT_FAILURE); \
} \
} while (0)
MPI_Op ToMPIType(ReduceOp reduction);
bool CheckMpiCudaAware();
void CheckValidInputs(const std::vector<phi::DenseTensor>& tensors);
} // namespace mpi
} // namespace distributed
} // namespace paddle
...@@ -52,5 +52,13 @@ ProcessGroup::ProcessGroup(int rank,
}
}
ProcessGroup::ProcessGroup(int rank, int size, int gid)
: rank_(rank), size_(size), gid_(gid) {
if (gid != IGNORE_ID) {
auto map = ProcessGroupMapFromGid::getInstance();
map->insert(gid_, this);
}
}
} // namespace distributed
} // namespace paddle
...@@ -82,6 +82,9 @@ class ProcessGroup {
int size,
const platform::Place& place,
int gid);
explicit ProcessGroup(int rank, int size, int gid);
virtual ~ProcessGroup() {}
int GetRank() const { return rank_; }
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupMPI.h"
#include <chrono>
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
constexpr int64_t kWaitBlockTimeout = 10;
namespace paddle {
namespace distributed {
std::map<phi::DataType, MPI_Datatype> mpiDatatype = {
{phi::DataType::INT8, MPI_CHAR},
{phi::DataType::UINT8, MPI_UNSIGNED_CHAR},
{phi::DataType::FLOAT32, MPI_FLOAT},
{phi::DataType::FLOAT64, MPI_DOUBLE},
{phi::DataType::INT32, MPI_INT},
{phi::DataType::INT64, MPI_LONG}};
void ProcessGroupMPI::MPITask::FinishMPITaskError(std::exception_ptr eptr) {
Finish(eptr);
}
void ProcessGroupMPI::MPITask::FinishMPITask() { Finish(); }
ProcessGroupMPI::MPIAsyncTask::MPIAsyncTask(
MPI_Request request, const std::vector<phi::DenseTensor>& inputs)
: ProcessGroup::Task(-1, inputs, CommType::UNKNOWN), request_(request) {
memset(&status_, 0, sizeof(status_));
}
ProcessGroupMPI::MPIAsyncTask::~MPIAsyncTask() {
if (request_ != MPI_REQUEST_NULL) {
std::cerr << " Task has not completed, try to destruct async mpi task, "
<< "exit the program." << std::endl;
std::terminate();
}
}
bool ProcessGroupMPI::MPIAsyncTask::IsCompleted() {
if (request_ == MPI_REQUEST_NULL) {
return true;
}
std::unique_lock<std::mutex> lock(pg_global_mutex);
int flag = 0;
MPI_CHECK(MPI_Test(&request_, &flag, &status_));
if (request_ != MPI_REQUEST_NULL) {
return false;
}
if (status_.MPI_ERROR != MPI_SUCCESS) {
AppearException();
}
return true;
}
bool ProcessGroupMPI::MPIAsyncTask::Wait(std::chrono::milliseconds timeout) {
if (request_ == MPI_REQUEST_NULL) {
return true;
}
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Wait(&request_, &status_));
if (status_.MPI_ERROR != MPI_SUCCESS) {
AppearException();
std::rethrow_exception(exception_);
return false;
}
return true;
}
void ProcessGroupMPI::MPIAsyncTask::AppearException() {
std::array<char, MPI_MAX_ERROR_STRING> buf;
int len = buf.size();
MPI_CHECK(MPI_Error_string(status_.MPI_ERROR, buf.data(), &len));
exception_ =
std::make_exception_ptr(std::runtime_error(std::string(buf.data(), len)));
}
void ProcessGroupMPI::MPIAsyncTask::SetOutputs(
std::vector<phi::DenseTensor>& outputs) {
outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}
int ProcessGroupMPI::mpi_thread_support = 0;
std::mutex ProcessGroupMPI::pg_global_mutex;
std::once_flag ProcessGroupMPI::onceFlag;
void ProcessGroupMPI::ExitMPI() {
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Finalize());
}
void ProcessGroupMPI::InitOneTimeMPI() {
std::call_once(onceFlag, []() {
MPI_CHECK(MPI_Init_thread(
nullptr, nullptr, MPI_THREAD_SERIALIZED, &mpi_thread_support));
PADDLE_ENFORCE_EQ(
mpi_thread_support < MPI_THREAD_SERIALIZED,
false,
platform::errors::InvalidArgument("MPI supports the number of threads "
"less than MPI_THREAD_SERIALIZED. "));
std::atexit(ProcessGroupMPI::ExitMPI);
});
}
std::shared_ptr<ProcessGroupMPI> ProcessGroupMPI::CreateProcessGroupMPI(
const std::vector<int>& ranks, int gid) {
InitOneTimeMPI();
MPI_Comm groupComm = MPI_COMM_WORLD;
int rank = -1;
int size = -1;
{
std::lock_guard<std::mutex> lock(pg_global_mutex);
if (!ranks.empty()) {
MPI_Group worldGroup;
MPI_Group ranksGroup;
MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
MPI_CHECK(
MPI_Group_incl(worldGroup, ranks.size(), ranks.data(), &ranksGroup));
constexpr int maxRetries = 3;
bool create_success = false;
MPI_Barrier(MPI_COMM_WORLD);
for (auto i = 0; i < maxRetries; i++) {
if (MPI_Comm_create(MPI_COMM_WORLD, ranksGroup, &groupComm)) {
create_success = true;
break;
}
}
MPI_CHECK(create_success);
MPI_CHECK(MPI_Group_free(&worldGroup));
MPI_CHECK(MPI_Group_free(&ranksGroup));
}
if (groupComm != MPI_COMM_NULL) {
MPI_CHECK(MPI_Comm_rank(groupComm, &rank));
MPI_CHECK(MPI_Comm_size(groupComm, &size));
PADDLE_ENFORCE_EQ(
rank < 0 || size < 0,
false,
platform::errors::InvalidArgument("get world_size or rank failed!"));
}
}
if (groupComm == MPI_COMM_NULL) {
return std::shared_ptr<ProcessGroupMPI>();
}
VLOG(3) << "MPI Group Create Success! rank = " << rank << " size = " << size
<< " group_id = " << gid;
return std::make_shared<ProcessGroupMPI>(rank, size, groupComm, gid);
}
ProcessGroupMPI::ProcessGroupMPI(int rank, int size, MPI_Comm pg_comm, int gid)
: ProcessGroup(rank, size, gid), stop_(false), pg_comm(pg_comm) {
PADDLE_ENFORCE_EQ(
pg_comm == MPI_COMM_NULL,
false,
platform::errors::InvalidArgument("Error! mpi comm is MPI_COMM_NULL!"));
worker_thread = std::thread(&ProcessGroupMPI::workLoop, this);
}
ProcessGroupMPI::~ProcessGroupMPI() {
std::unique_lock<std::mutex> lock(pg_mutex);
queue_consume.wait(lock, [&] { return queue_.empty(); });
stop_ = true;
lock.unlock();
queue_produce.notify_all();
worker_thread.join();
}
void ProcessGroupMPI::workLoop() {
std::unique_lock<std::mutex> lock(pg_mutex);
while (!stop_) {
if (queue_.empty()) {
queue_produce.wait(lock);
continue;
}
auto taskTuple = std::move(queue_.front());
queue_.pop_front();
auto& taskEntry = std::get<0>(taskTuple);
auto& task = std::get<1>(taskTuple);
lock.unlock();
queue_consume.notify_one();
try {
taskEntry->run_(taskEntry);
task->FinishMPITask();
} catch (...) {
task->FinishMPITaskError(std::current_exception());
}
lock.lock();
}
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Enqueue(
std::unique_ptr<TaskEntry> entry,
const std::vector<phi::DenseTensor>& inputs) {
auto task = std::make_shared<MPITask>(entry->dst_, inputs);
std::unique_lock<std::mutex> lock(pg_mutex);
queue_.push_back(std::make_tuple(std::move(entry), task));
lock.unlock();
queue_produce.notify_one();
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& opts) {
mpi::CheckValidInputs(in_tensors);
const auto places = GetPlaceList(in_tensors);
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[opts, this](std::unique_ptr<TaskEntry>& entry) {
auto data = (entry->src_)[0];
std::unique_lock<std::mutex> lock(pg_global_mutex);
const auto root = opts.source_rank + opts.source_root;
MPI_CHECK(MPI_Bcast(data.data(),
data.numel(),
mpiDatatype.at(data.dtype()),
root,
pg_comm));
};
auto entry = std::make_unique<TaskEntry>(
&in_tensors, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& opts) {
mpi::CheckValidInputs(in_tensors);
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[opts, this](std::unique_ptr<TaskEntry>& entry) {
auto data = (entry->src_)[0];
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Allreduce(MPI_IN_PLACE,
data.data(),
data.numel(),
mpiDatatype.at(data.dtype()),
mpi::ToMPIType(opts.reduce_op),
pg_comm));
};
auto entry = std::make_unique<TaskEntry>(
&in_tensors, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Barrier(
const BarrierOptions& opts) {
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[this](std::unique_ptr<TaskEntry>& entry) {
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Barrier(pg_comm));
};
auto entry =
std::make_unique<TaskEntry>(nullptr, nullptr, std::move(runFunc));
return Enqueue(std::move(entry), std::vector<phi::DenseTensor>{});
}
// NOTE: MPI send/recv use gid_ as the message tag.
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Send(
std::vector<phi::DenseTensor>& tensors, int dst_rank) {
mpi::CheckValidInputs(tensors);
auto& tensor = tensors[0];
MPI_Request request = MPI_REQUEST_NULL;
{
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Isend(tensor.data(),
tensor.numel(),
mpiDatatype.at(tensor.dtype()),
dst_rank,
this->gid_,
pg_comm,
&request));
}
return std::make_shared<ProcessGroupMPI::MPIAsyncTask>(request, tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Recv(
std::vector<phi::DenseTensor>& tensors, int src_rank) {
mpi::CheckValidInputs(tensors);
auto& tensor = tensors[0];
MPI_Request request = MPI_REQUEST_NULL;
{
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Irecv(tensor.data(),
tensor.numel(),
mpiDatatype.at(tensor.dtype()),
src_rank,
this->gid_,
pg_comm,
&request));
}
return std::make_shared<ProcessGroupMPI::MPIAsyncTask>(request, tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
mpi::CheckValidInputs(in_tensors);
PADDLE_ENFORCE_EQ(out_tensors.size() == 1,
true,
platform::errors::InvalidArgument(
"MPI only support a single tensor op."));
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[this](std::unique_ptr<TaskEntry>& entry) {
auto data = (entry->src_)[0];
std::vector<phi::DenseTensor> dst = entry->dst_;
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Allgather(data.data(),
data.numel(),
mpiDatatype.at(data.dtype()),
dst[0].data(),
data.numel(),
mpiDatatype.at(data.dtype()),
pg_comm));
};
auto entry = std::make_unique<TaskEntry>(
&in_tensors, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllToAll(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
mpi::CheckValidInputs(in_tensors);
mpi::CheckValidInputs(out_tensors);
PADDLE_ENFORCE_EQ(in_tensors[0].numel() == out_tensors[0].numel() &&
in_tensors[0].dtype() == out_tensors[0].dtype(),
true,
platform::errors::InvalidArgument(
"MPI AlltoAll: input and output are not equal in "
"size or data type."));
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[this](std::unique_ptr<TaskEntry>& entry) {
auto srcdata = (entry->src_)[0];
auto dstdata = (entry->dst_)[0];
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Alltoall(srcdata.data(),
srcdata.numel() / size_,
mpiDatatype.at(srcdata.dtype()),
dstdata.data(),
dstdata.numel() / size_,
mpiDatatype.at(dstdata.dtype()),
pg_comm));
};
auto entry = std::make_unique<TaskEntry>(
&in_tensors, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Reduce(
std::vector<phi::DenseTensor>& tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts) {
mpi::CheckValidInputs(tensors);
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[opts, this](std::unique_ptr<TaskEntry>& entry) {
auto data = (entry->src_)[0];
auto dataPtr = (entry->src_)[0].data();
void* sendbuf = (rank_ == opts.root_rank) ? MPI_IN_PLACE : dataPtr;
void* recvbuf = (rank_ == opts.root_rank) ? dataPtr : nullptr;
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Reduce(sendbuf,
recvbuf,
data.numel(),
mpiDatatype.at(data.dtype()),
mpi::ToMPIType(opts.reduce_op),
opts.root_rank,
pg_comm));
};
auto entry =
std::make_unique<TaskEntry>(&tensors, &tensors, std::move(runFunc));
return Enqueue(std::move(entry), tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts) {
mpi::CheckValidInputs(in_tensors);
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[opts, this](std::unique_ptr<TaskEntry>& entry) {
auto data = (entry->dst_)[0];
void* sendbuf = nullptr;
if (rank_ == opts.root_rank) {
std::vector<phi::DenseTensor>& inputData = entry->src_;
sendbuf = inputData[0].data();
}
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Scatter(sendbuf,
data.numel(),
mpiDatatype.at(data.dtype()),
data.data(),
data.numel(),
mpiDatatype.at(data.dtype()),
opts.root_rank,
pg_comm));
};
if (rank_ == opts.root_rank) {
auto entry = std::make_unique<TaskEntry>(
&in_tensors, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
} else {
auto entry =
std::make_unique<TaskEntry>(nullptr, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
}
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include <condition_variable>
#include <deque>
#include <exception>
#include <mutex>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_MPI)
#include "paddle/fluid/distributed/collective/MPITools.h"
#endif
constexpr const char* MPI_BACKEND_NAME = "MPI";
namespace paddle {
namespace distributed {
struct TaskEntry {
explicit TaskEntry(std::vector<phi::DenseTensor>* src_ptr,
std::vector<phi::DenseTensor>* dst_ptr,
std::function<void(std::unique_ptr<TaskEntry>&)> run)
: dst_(dst_ptr ? *dst_ptr : std::vector<phi::DenseTensor>()),
run_(std::move(run)) {
if (src_ptr) {
src_ = *src_ptr;
}
}
TaskEntry(const TaskEntry&) = delete;
TaskEntry& operator=(const TaskEntry&) = delete;
std::vector<phi::DenseTensor> src_;
std::vector<phi::DenseTensor> dst_;
int* srcRank_ = nullptr;
std::function<void(std::unique_ptr<TaskEntry>&)> run_;
};
class ProcessGroupMPI : public ProcessGroup {
public:
class MPITask : public ProcessGroup::Task {
public:
explicit MPITask(std::vector<phi::DenseTensor> outputTensors,
const std::vector<phi::DenseTensor>& inputTensors)
: ProcessGroup::Task(-1, inputTensors, CommType::UNKNOWN),
outputs_(std::move(outputTensors)) {}
void Synchronize() { Wait(); }
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout) {
std::unique_lock<std::mutex> lock(mutex_);
if (timeout == kWaitTimeout) {
// This waits without a timeout.
cv_.wait(lock, [&] { return is_completed_; });
} else {
// Waits for the user-provided timeout.
cv_.wait_for(lock, timeout, [&] { return is_completed_; });
PADDLE_ENFORCE_EQ(
is_completed_,
true,
platform::errors::InvalidArgument("MPI operation timeout! "));
}
if (exception_) {
std::rethrow_exception(exception_);
}
return true;
}
protected:
friend class ProcessGroupMPI;
private:
// about mpi
void Finish(std::exception_ptr exception = nullptr) {
is_completed_ = true;
exception_ = exception;
cv_.notify_all();
}
void FinishMPITask();
void FinishMPITaskError(std::exception_ptr eptr);
std::vector<phi::DenseTensor> outputs_;
std::condition_variable cv_;
std::exception_ptr exception_;
};
public:
class MPIAsyncTask : public ProcessGroup::Task {
public:
MPIAsyncTask(MPI_Request request,
const std::vector<phi::DenseTensor>& inputs);
bool IsCompleted();
void Synchronize() {}
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void SetOutputs(std::vector<phi::DenseTensor>& outputs); // NOLINT
virtual ~MPIAsyncTask();
protected:
void AppearException();
private:
std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
MPI_Request request_;
MPI_Status status_;
std::exception_ptr exception_;
};
ProcessGroupMPI(int rank, int size, MPI_Comm pgComm, int gid);
virtual ~ProcessGroupMPI();
const std::string GetBackendName() const override {
return std::string(MPI_BACKEND_NAME);
}
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) override;
std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>& tensors, int dst_rank) override;
std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>& tensors, int src_rank) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<phi::DenseTensor>& in,
std::vector<phi::DenseTensor>& out) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions&) override;
static std::shared_ptr<ProcessGroupMPI> CreateProcessGroupMPI(
const std::vector<int>& ranks, int gid);
protected:
void workLoop();
std::shared_ptr<ProcessGroup::Task> Enqueue(
std::unique_ptr<TaskEntry> entry,
const std::vector<phi::DenseTensor>& inputs);
private:
bool stop_{false};
std::mutex pg_mutex;
std::thread worker_thread;
std::deque<std::tuple<std::unique_ptr<TaskEntry>, std::shared_ptr<MPITask>>>
queue_;
std::condition_variable queue_produce;
std::condition_variable queue_consume;
static void InitOneTimeMPI();
static void ExitMPI();
static std::once_flag onceFlag;
static std::mutex pg_global_mutex;
static int mpi_thread_support;
MPI_Comm pg_comm;
};
} // namespace distributed
} // namespace paddle
...@@ -151,6 +151,9 @@ if(WITH_PYTHON)
if(WITH_GLOO)
set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo)
endif()
if(WITH_MPI)
set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_mpi)
endif()
if(WITH_ASCEND_CL)
set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl)
if(WITH_PSCORE)
...@@ -591,6 +594,10 @@ if(WITH_PYTHON)
target_link_libraries(libpaddle ${ROCM_HIPRTC_LIB})
endif()
if(WITH_MPI)
target_link_libraries(libpaddle ${MPI_CXX_LIBRARIES})
endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(libpaddle ${os_dependency_modules})
add_dependencies(libpaddle op_function_generator_cmd)
......
...@@ -36,6 +36,10 @@ limitations under the License. */
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#endif
#if defined(PADDLE_WITH_MPI)
#include "paddle/fluid/distributed/collective/ProcessGroupMPI.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
#endif
...@@ -623,6 +627,25 @@ void BindDistributed(py::module *m) {
#endif
#if defined(PADDLE_WITH_MPI)
py::class_<distributed::ProcessGroupMPI,
std::shared_ptr<distributed::ProcessGroupMPI>>(
*m, "ProcessGroupMPI", ProcessGroup)
.def_static(
"create",
[](const std::vector<int> &ranks,
int gid) -> std::shared_ptr<distributed::ProcessGroupMPI> {
return paddle::distributed::ProcessGroupMPI::CreateProcessGroupMPI(
ranks, gid);
})
.def("get_rank",
&distributed::ProcessGroup::GetRank,
py::call_guard<py::gil_scoped_release>())
.def("get_world_size",
&distributed::ProcessGroup::GetSize,
py::call_guard<py::gil_scoped_release>());
#endif
#if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \
(defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL))
py::class_<distributed::ProcessGroupHeter,
......
...@@ -229,6 +229,23 @@ bool IsCompiledWithNCCL() {
#endif
}
bool IsCompiledWithMPI() {
#ifdef PADDLE_WITH_MPI
return true;
#else
return false;
#endif
}
// NOTE: some MPI libraries support CUDA-aware communication; support it in the future.
bool IsCompiledWithMPIAWARE() {
#ifdef PADDLE_WITH_MPI_AWARE
return true;
#else
return false;
#endif
}
bool IsCompiledWithROCM() {
#ifndef PADDLE_WITH_HIP
return false;
...@@ -1718,6 +1735,8 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("is_compiled_with_nccl", IsCompiledWithNCCL);
m.def("is_compiled_with_mpi", IsCompiledWithMPI);
m.def("is_compiled_with_mpi_aware", IsCompiledWithMPIAWARE);
m.def("is_compiled_with_cinn", IsCompiledWithCINN); m.def("is_compiled_with_cinn", IsCompiledWithCINN);
m.def("is_compiled_with_mlu", IsCompiledWithMLU); m.def("is_compiled_with_mlu", IsCompiledWithMLU);
m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS); m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS);
......
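For reference, a hedged sketch of how the compile-time queries bound above could be checked from Python (assuming the usual exposure of these bindings through paddle.fluid.core):

import paddle.fluid.core as core

# True when Paddle was built with WITH_MPI=ON (PADDLE_WITH_MPI defined).
print(core.is_compiled_with_mpi())

# True only when cmake/mpi.cmake additionally detected a CUDA-aware MPI
# ("smcuda" in the ompi_info output) and defined PADDLE_WITH_MPI_AWARE; the
# collective path itself still reports CheckMpiCudaAware() as false for now.
print(core.is_compiled_with_mpi_aware())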
...@@ -323,5 +323,17 @@ if((WITH_ROCM OR WITH_GPU) AND (LINUX))
"PADDLE_DIST_UT_PORT=21532;http_proxy=;https_proxy=")
set_tests_properties(test_world_size_and_rank PROPERTIES TIMEOUT "120")
endif()
if(WITH_MPI)
if(LOCAL_ALL_ARCH AND (LINUX))
bash_test_modules(
test_mpi_comm
START_BASH
test_mpi_comm.sh
LABELS
"RUN_TYPE=DIST"
ENVS
"PADDLE_DIST_UT_PORT=21672;http_proxy=;https_proxy=")
endif()
endif()
add_subdirectory(fleet)
add_subdirectory(multinode)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import random
import numpy as np
import os
import shutil
import paddle
from paddle.fluid import core
from datetime import timedelta
import paddle.fluid.core as core
from paddle.fluid.framework import _test_eager_guard
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.distributed.collective import Group
from paddle.distributed.collective import _group_map_by_name
from paddle.distributed.collective import _default_group_name
from paddle.distributed.collective import _set_group_map
from paddle.distributed.collective import _set_group_map_by_name
from paddle.distributed.collective import _set_group_map_backend
from paddle.fluid.framework import _set_expected_place
import paddle.distributed as dist
import ctypes
ctypes.CDLL("libmpi.so", mode=ctypes.RTLD_GLOBAL)
def init_process_group(strategy=None):
gid = 0
pg = core.ProcessGroupMPI.create([], gid)
rank = pg.get_rank()
world_size = pg.get_world_size()
# support CPU
place = core.CPUPlace()
_set_expected_place(place)
group = Group(rank,
world_size,
id=0,
ranks=list(range(world_size)),
pg=pg,
name=_default_group_name)
_set_group_map_by_name(_default_group_name, group)
_set_group_map(gid, group)
_set_group_map_backend(group, "mpi")
return group
def test_allreduce_sum(pg, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(shape).astype(dtype)
tensor_y = paddle.to_tensor(y)
sum_result = tensor_x + tensor_y
if pg.rank() == 0:
task = dist.all_reduce(tensor_x)
assert np.array_equal(tensor_x, sum_result)
else:
task = dist.all_reduce(tensor_y)
assert np.array_equal(tensor_y, sum_result)
print("test allreduce sum api ok")
def test_allreduce_max(pg, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(shape).astype(dtype)
tensor_y = paddle.to_tensor(y)
max_result = paddle.maximum(tensor_x, tensor_y)
if pg.rank() == 0:
task = dist.all_reduce(tensor_x,
dist.ReduceOp.MAX,
use_calc_stream=False)
task.wait()
assert np.array_equal(tensor_x, max_result)
else:
task = dist.all_reduce(tensor_y,
dist.ReduceOp.MAX,
use_calc_stream=False)
task.wait()
assert np.array_equal(tensor_y, max_result)
print("test allreduce max api ok")
def test_allreduce_min(pg, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(shape).astype(dtype)
tensor_y = paddle.to_tensor(y)
min_result = paddle.minimum(tensor_x, tensor_y)
if pg.rank() == 0:
task = dist.all_reduce(tensor_x,
dist.ReduceOp.MIN,
use_calc_stream=False)
task.wait()
assert np.array_equal(tensor_x, min_result)
else:
task = dist.all_reduce(tensor_y,
dist.ReduceOp.MIN,
use_calc_stream=False)
task.wait()
assert np.array_equal(tensor_y, min_result)
print("test allreduce min api ok")
def test_allreduce_prod(pg, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(shape).astype(dtype)
tensor_y = paddle.to_tensor(y)
prod_result = np.multiply(x, y)
if pg.rank() == 0:
task = dist.all_reduce(tensor_x,
dist.ReduceOp.PROD,
use_calc_stream=False)
task.wait()
assert np.array_equal(tensor_x, prod_result)
else:
task = dist.all_reduce(tensor_y,
dist.ReduceOp.PROD,
use_calc_stream=False)
task.wait()
assert np.array_equal(tensor_y, prod_result)
print("test allreduce prod api ok")
def test_broadcast(pg, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(shape).astype(dtype)
tensor_y = paddle.to_tensor(y)
broadcast_result = paddle.assign(tensor_x)
if pg.rank() == 0:
task = dist.broadcast(tensor_x, 0, use_calc_stream=False)
task.synchronize()
assert task.is_completed()
assert np.array_equal(broadcast_result, tensor_x)
else:
task = dist.broadcast(tensor_y, 0)
assert np.array_equal(broadcast_result, tensor_y)
print("test broadcast api ok")
def test_barrier(pg):
# rank 0
if pg.rank() == 0:
dist.barrier()
# rank 1
else:
task = pg.barrier()
task.wait()
print("test barrier api ok\n")
def test_allgather(pg, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
y = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
out_shape = list(shape)
out_shape[0] *= 2
out = np.random.random(out_shape).astype(dtype)
tensor_out = paddle.to_tensor(out)
if pg.rank() == 0:
task = pg.all_gather(tensor_x, tensor_out)
task.wait()
# rank 1
else:
tensor_out_list = [
paddle.empty_like(tensor_x),
paddle.empty_like(tensor_x)
]
task = dist.all_gather(tensor_out_list, tensor_y, use_calc_stream=False)
tensor_out = paddle.concat(tensor_out_list)
out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2], [out_shape[0]])
assert np.array_equal(tensor_x, out_1)
assert np.array_equal(tensor_y, out_2)
print("test allgather api ok\n")
if pg.rank() == 0:
task = pg.all_gather(tensor_x, tensor_out)
task.wait()
# rank 1
else:
tensor_out_list = []
task = dist.all_gather(tensor_out_list, tensor_y, use_calc_stream=False)
tensor_out = paddle.concat(tensor_out_list)
out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2], [out_shape[0]])
assert np.array_equal(tensor_x, out_1)
assert np.array_equal(tensor_y, out_2)
print("test allgather api2 ok\n")
def test_all2all(pg, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
y = np.random.random(shape).astype(dtype)
out1 = np.random.random(shape).astype(dtype)
out2 = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
tensor_out1 = paddle.to_tensor(out1)
tensor_out2 = paddle.to_tensor(out2)
raw_tensor_x_2 = paddle.slice(tensor_x, [0], [shape[0] // 2], [shape[0]])
raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0], [shape[0] // 2])
if pg.rank() == 0:
task = pg.alltoall(tensor_x, tensor_out1)
task.wait()
# rank 1
else:
in_1, in_2 = paddle.split(tensor_y, 2)
out_1, out_2 = paddle.split(tensor_out2, 2)
out_tensor_list = [out_1, out_2]
task = dist.alltoall([in_1, in_2], out_tensor_list)
tensor_out2 = paddle.concat(out_tensor_list)
out1_2 = paddle.slice(tensor_out1, [0], [shape[0] // 2], [shape[0]])
out2_1 = paddle.slice(tensor_out2, [0], [0], [shape[0] // 2])
if pg.rank() == 0:
assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
else:
assert np.array_equal(out2_1, raw_tensor_x_2)
print("test alltoall api ok\n")
x = np.random.random(shape).astype(dtype)
y = np.random.random(shape).astype(dtype)
out1 = np.random.random(shape).astype(dtype)
out2 = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
tensor_out1 = paddle.to_tensor(out1)
tensor_out2 = paddle.to_tensor(out2)
raw_tensor_x_2 = paddle.slice(tensor_x, [0], [shape[0] // 2], [shape[0]])
raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0], [shape[0] // 2])
if pg.rank() == 0:
task = pg.alltoall(tensor_x, tensor_out1)
task.wait()
# rank 1
else:
in_1, in_2 = paddle.split(tensor_y, 2)
out_1, out_2 = paddle.split(tensor_out2, 2)
out_tensor_list = []
task = dist.alltoall([in_1, in_2], out_tensor_list)
tensor_out2 = paddle.concat(out_tensor_list)
out1_2 = paddle.slice(tensor_out1, [0], [shape[0] // 2], [shape[0]])
out2_1 = paddle.slice(tensor_out2, [0], [0], [shape[0] // 2])
if pg.rank() == 0:
assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
else:
assert np.array_equal(out2_1, raw_tensor_x_2)
print("test alltoall api2 ok\n")
def test_reduce_sum(pg, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
y = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
sum_result = tensor_x + tensor_y
if pg.rank() == 0:
task = dist.reduce(tensor_x, 0, use_calc_stream=True)
# rank 1
else:
task = dist.reduce(tensor_y, 0, use_calc_stream=False)
task.wait()
if pg.rank() == 0:
assert np.array_equal(tensor_x, sum_result)
print("test reduce sum api ok\n")
def test_reduce_max(pg, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(shape).astype(dtype)
tensor_y = paddle.to_tensor(y)
max_result = paddle.maximum(tensor_x, tensor_y)
if pg.rank() == 0:
task = dist.reduce(tensor_x,
0,
dist.ReduceOp.MAX,
use_calc_stream=False)
task.wait()
assert np.array_equal(tensor_x, max_result)
else:
task = dist.reduce(tensor_y,
0,
dist.ReduceOp.MAX,
use_calc_stream=False)
task.wait()
print("test reduce max api ok")
def test_reduce_min(pg, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(shape).astype(dtype)
tensor_y = paddle.to_tensor(y)
min_result = paddle.minimum(tensor_x, tensor_y)
if pg.rank() == 0:
task = dist.reduce(tensor_x,
0,
dist.ReduceOp.MIN,
use_calc_stream=False)
task.wait()
assert np.array_equal(tensor_x, min_result)
else:
task = dist.reduce(tensor_y,
0,
dist.ReduceOp.MIN,
use_calc_stream=False)
task.wait()
print("test reduce min api ok")
def test_reduce_prod(pg, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(shape).astype(dtype)
tensor_y = paddle.to_tensor(y)
prod_result = np.multiply(x, y)
if pg.rank() == 0:
task = dist.reduce(tensor_x,
0,
dist.ReduceOp.PROD,
use_calc_stream=False)
task.wait()
assert np.array_equal(tensor_x, prod_result)
else:
task = dist.reduce(tensor_y,
0,
dist.ReduceOp.PROD,
use_calc_stream=False)
task.wait()
print("test reduce prod api ok")
def test_scatter(pg, shape, dtype):
# rank 0
in_shape = list(shape)
in_shape[0] *= 2
x = np.random.random(in_shape).astype(dtype)
y = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
if pg.rank() == 0:
in_1, in_2 = paddle.split(tensor_x, 2)
task = dist.scatter(tensor_y, [in_1, in_2], 0, use_calc_stream=True)
# rank 1
else:
task = dist.scatter(tensor_y, [], 0, use_calc_stream=False)
task.wait()
out1 = paddle.slice(tensor_x, [0], [0], [shape[0]])
out2 = paddle.slice(tensor_x, [0], [shape[0]], [shape[0] * 2])
if pg.rank() == 0:
assert np.array_equal(tensor_y, out1)
else:
assert np.array_equal(tensor_y, out2)
print("test scatter api ok\n")
def test_send_recv(pg, sub_group, shape, dtype):
# rank 0
x = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(shape).astype(dtype)
tensor_y = paddle.to_tensor(y)
if pg.rank() == 0:
task = dist.send(tensor_x, 1, group=sub_group, use_calc_stream=False)
task.wait()
elif pg.rank() == 1:
task = dist.recv(tensor_y, 0, group=sub_group, use_calc_stream=False)
task.wait()
assert np.array_equal(tensor_y, tensor_x)
print("test send api ok")
# test send/recv with use_calc_stream=True
# rank 0
x = np.random.random(shape).astype(dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(shape).astype(dtype)
tensor_y = paddle.to_tensor(y)
if pg.rank() == 0:
task = dist.send(tensor_x, 1, group=sub_group, use_calc_stream=True)
elif pg.rank() == 1:
task = dist.recv(tensor_y, 0, group=sub_group, use_calc_stream=True)
assert np.array_equal(tensor_y, tensor_x)
print("test send api ok")
class TestProcessGroup(unittest.TestCase):
def setUp(self):
paddle.seed(2022)
random.seed(2022)
np.random.seed(2022)
self.config()
def config(self):
self.dtype = "float32"
self.shape = (2, 10, 5)
def test_create_process_group_mpi(self):
with _test_eager_guard():
group = init_process_group()
pg = group.process_group
# test allreduce sum
test_allreduce_sum(pg, self.shape, self.dtype)
# test allreduce max
test_allreduce_max(pg, self.shape, self.dtype)
# test allreduce min
test_allreduce_min(pg, self.shape, self.dtype)
# test allreduce prod
test_allreduce_prod(pg, self.shape, self.dtype)
# test broadcast
test_broadcast(pg, self.shape, self.dtype)
# test barrier
test_barrier(pg)
# test allgather
test_allgather(pg, self.shape, self.dtype)
# test alltoall
test_all2all(pg, self.shape, self.dtype)
# test Reduce
test_reduce_sum(pg, self.shape, self.dtype)
# test reduce max
test_reduce_max(pg, self.shape, self.dtype)
# test reduce min
test_reduce_min(pg, self.shape, self.dtype)
# test reduce product
test_reduce_prod(pg, self.shape, self.dtype)
# test Scatter
test_scatter(pg, self.shape, self.dtype)
# test send recv.
test_send_recv(pg, group, self.shape, self.dtype)
if __name__ == "__main__":
unittest.main()
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# use default values
export PADDLE_DISTRI_BACKEND="mpi"
cmd=`which mpirun`
if [ ${#cmd} -eq 0 ]
then
echo "Warning! mpirun command not found!"
else
${cmd} -x PADDLE_DISTRI_BACKEND -np 2 --allow-run-as-root python3.8 process_group_mpi.py
fi
...@@ -38,3 +38,4 @@ test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_
test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
test_world_size_and_rank,linux,rocm;gpu,120,DIST,test_world_size_and_rank.sh,2,,http_proxy=;https_proxy=,
test_mpi_comm,linux,,,DIST,test_mpi_comm.sh,2,,http_proxy=;https_proxy=,WITH_MPI