Unverified commit ade51aa5, authored by Ghost Screaming, committed by GitHub

[Auto Parallel] Compatible new comm library upgrade (#56604)

* for verify

Fluid operators support the new comm library.

* u

* u

* u

* Compatible new comm library upgrade for c_allgather, c_reduce, c_reduce_scatter, and c_scatter.

* Remove useless comments in process_group.py

* Polish code style.

* Fix some problems.

* Remove use of the fluid API in phi comm_context_manager.

* Add PADDLE_WITH_CUDA and PADDLE_WITH_NCCL macro guards.

* Fix bug of HIP architecture.

* Fix some problems:
1. Remove useless loggings.
2. Fix conditional compilation for HIP.
3. Fix problems in test_pass_generation_pipeline.py. It calls paddle.distributed.init_parallel_env() first; then auto.Engine calls _init_comm(), which calls process_group.instantiate(). However, init_parallel_env() calls paddle.distributed.barrier(), which calls CreateNCCLEnvCache and creates the corresponding NCCLCommContext. Because dev_id is not set at that point, the NCCLCommContext's dev_ctx is not initialized (see the sketch after the commit message).

* Fix some problems.

* Polish code.

* Polish code.

* Revert compatible upgrade for communication operators. Their upgrades
will be submitted in another PR.

* Remove StaticTCPStore.

* Remove useless modification.

* Remove useless set_cuda_device_id.

* Polish code.

* Remove fluid header files in phi files.

* Remove useless comments.

* Fix problems of the HIP architecture.

* Fix some problems.

* Polish code.

* Polish code style.

---------
Co-authored-by: hitywt <yuwentao126@126.com>
Parent commit: ded10442
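
A minimal sketch of the init-order pitfall fixed by this commit, assuming the new comm library is active (FLAGS_dynamic_static_unified_comm=1) and a job launched with `python -m paddle.distributed.launch`:

# Sketch only; the C++ internals named in the comments (CreateNCCLEnvCache,
# CreateNCCLCommContext) are taken from the commit description above.
import paddle.distributed as dist

dist.init_parallel_env()
# init_parallel_env() ends with dist.barrier(); barrier() reaches
# CreateNCCLEnvCache, which creates the NCCLCommContext. Before this
# commit no device id had been registered at that point, so the context's
# dev_ctx was left uninitialized. The fix calls
# core.CommContextManager.set_device_id(parallel_env.device_id) right
# before the barrier (see the last hunk below).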
@@ -57,6 +57,11 @@ limitations under the License. */
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/custom_kernel.h"
 
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
+#endif
+
 PHI_DECLARE_int32(paddle_num_threads);
 PADDLE_DEFINE_EXPORTED_int32(
     multiple_of_cupti_buffer_size,
@@ -440,6 +445,41 @@ void InitMemoryMethod() {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage;
 #endif
+
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+  // TODO(GhostScreaming): Use phi methods later.
+  memory_method->get_allocator =
+      [](int device_id, phi::gpuStream_t stream) -> phi::Allocator * {
+    return paddle::memory::allocation::AllocatorFacade::Instance()
+        .GetAllocator(phi::GPUPlace(device_id), stream)
+        .get();
+  };
+  memory_method->get_host_allocator = []() -> phi::Allocator * {
+    return paddle::memory::allocation::AllocatorFacade::Instance()
+        .GetAllocator(phi::CPUPlace())
+        .get();
+  };
+  memory_method->get_zero_allocator = [](int device_id) -> phi::Allocator * {
+    return paddle::memory::allocation::AllocatorFacade::Instance()
+        .GetZeroAllocator(phi::GPUPlace(device_id))
+        .get();
+  };
+  memory_method->get_host_zero_allocator = []() -> phi::Allocator * {
+    return paddle::memory::allocation::AllocatorFacade::Instance()
+        .GetZeroAllocator(phi::CPUPlace())
+        .get();
+  };
+  memory_method->get_pinned_allocator = []() -> phi::Allocator * {
+    return paddle::memory::allocation::AllocatorFacade::Instance()
+        .GetAllocator(phi::GPUPinnedPlace())
+        .get();
+  };
+  memory_method->get_new_cuda_event = [](int device_id) {
+    return paddle::platform::CudaEventResourcePool::Instance().New(device_id);
+  };
+#endif
+
   memory_method->emplace_device_contexts =
       paddle::platform::EmplaceDeviceContexts;
   memory_method->init_devices = InitDevices;
......
@@ -42,14 +42,14 @@ void BindCommContextManager(py::module *m) {
   py::class_<phi::distributed::CommContextManager,
              std::shared_ptr<phi::distributed::CommContextManager>>(
       *m, "CommContextManager")
+      .def_static("set_device_id",
+                  &phi::distributed::CommContextManager::SetDeviceId,
+                  py::call_guard<py::gil_scoped_release>())
 #if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
       .def_static(
           "create_nccl_comm_context",
           &phi::distributed::CommContextManager::CreateNCCLCommContext,
           py::call_guard<py::gil_scoped_release>())
-      .def_static("set_cuda_device_id",
-                  &phi::distributed::CommContextManager::SetCUDADeviceId,
-                  py::call_guard<py::gil_scoped_release>())
 #endif
 #if defined(PADDLE_WITH_GLOO)
       .def_static(
......
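
From Python, the statics bound above are reachable through paddle's compiled core module. A hedged usage sketch for rank 0 of a two-rank job (the import path, ring id, and rank values are illustrative):

# Assumes a 2-rank job; "0" is the unique_comm_key (ring id as a string).
from paddle.framework import core

store = core.create_or_get_global_tcp_store()  # shared rendezvous store
core.CommContextManager.set_device_id(0)  # bind this process to GPU 0 first
core.CommContextManager.create_nccl_comm_context(store, "0", 0, 2)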
@@ -90,6 +90,34 @@ void EmplaceDeviceContexts(
       stream_priority);
 }
 
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) {
+  return MemoryUtils::Instance().GetAllocator(device_id, stream);
+}
+
+const phi::Allocator* GetHostAllocator() {
+  return MemoryUtils::Instance().GetHostAllocator();
+}
+
+const phi::Allocator* GetZeroAllocator(int device_id) {
+  return MemoryUtils::Instance().GetZeroAllocator(device_id);
+}
+
+const phi::Allocator* GetHostZeroAllocator() {
+  return MemoryUtils::Instance().GetHostZeroAllocator();
+}
+
+const phi::Allocator* GetPinnedAllocator() {
+  return MemoryUtils::Instance().GetPinnedAllocator();
+}
+
+std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> GetCudaEvent(
+    int device_id) {
+  return MemoryUtils::Instance().GetCudaEvent(device_id);
+}
+#endif
+
 }  // namespace memory_utils
 
 }  // namespace phi
@@ -24,6 +24,15 @@
 #include "paddle/phi/core/macros.h"
 #include "paddle/phi/core/stream.h"
 
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 namespace phi {
 
 struct MemoryInterface {
@@ -150,6 +159,17 @@ struct MemoryInterface {
       const std::vector<phi::Place>& places,
       bool disable_setting_default_stream_for_allocator,
       int stream_priority);
+
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+  phi::Allocator* (*get_allocator)(int device_id, phi::gpuStream_t stream);
+  phi::Allocator* (*get_host_allocator)();
+  phi::Allocator* (*get_zero_allocator)(int device_id);
+  phi::Allocator* (*get_host_zero_allocator)();
+  phi::Allocator* (*get_pinned_allocator)();
+  std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> (
+      *get_new_cuda_event)(int device_id);
+#endif
 };
 
 class MemoryUtils {
@@ -323,6 +343,34 @@ class MemoryUtils {
             "Fluid. You can call InitMemoryMethod() for initialization."));
   }
 
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+  const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) {
+    return memory_method_->get_allocator(device_id, stream);
+  }
+
+  const phi::Allocator* GetHostAllocator() {
+    return memory_method_->get_host_allocator();
+  }
+
+  const phi::Allocator* GetZeroAllocator(int device_id) {
+    return memory_method_->get_zero_allocator(device_id);
+  }
+
+  const phi::Allocator* GetHostZeroAllocator() {
+    return memory_method_->get_host_zero_allocator();
+  }
+
+  const phi::Allocator* GetPinnedAllocator() {
+    return memory_method_->get_pinned_allocator();
+  }
+
+  std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> GetCudaEvent(
+      int device_id) {
+    return memory_method_->get_new_cuda_event(device_id);
+  }
+#endif
+
  private:
   MemoryUtils() = default;
@@ -385,6 +433,22 @@ void EmplaceDeviceContexts(
     bool disable_setting_default_stream_for_allocator,
     int stream_priority);
 
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+const Allocator* GetAllocator(int device_id, phi::gpuStream_t stream);
+
+const Allocator* GetHostAllocator();
+
+const Allocator* GetZeroAllocator(int device_id);
+
+const Allocator* GetHostZeroAllocator();
+
+const Allocator* GetPinnedAllocator();
+
+std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> GetCudaEvent(
+    int device_id);
+#endif
+
 class Buffer {
  public:
   explicit Buffer(const phi::Place& place) : place_(place) {}
......
@@ -12,44 +12,53 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#if defined(PADDLE_WITH_GLOO)
-#include <gloo/rendezvous/prefix_store.h>
-#include "paddle/phi/core/distributed/gloo_comm_context.h"
-#include "paddle/phi/core/distributed/gloo_utils.h"
-#include "paddle/phi/core/distributed/store/gloo_store.h"
-#endif
-
 #include "paddle/phi/core/distributed/comm_context_manager.h"
 
 #include <memory>
 #include <string>
 
+#include "glog/logging.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/core/distributed/store/store.h"
 #include "paddle/phi/core/enforce.h"
 
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/rendezvous/prefix_store.h>
+#include "paddle/phi/core/distributed/gloo_comm_context.h"
+#include "paddle/phi/core/distributed/gloo_utils.h"
+#include "paddle/phi/core/distributed/store/gloo_store.h"
+#endif
+
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/distributed/nccl_comm_context.h"
 #endif
 
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-#include "glog/logging.h"
 #include "paddle/phi/core/distributed/xccl_comm_context.h"
 #endif
 
 namespace phi {
 namespace distributed {
 
+int CommContextManager::device_id = -1;
+
+void CommContextManager::SetDeviceId(int dev_id) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-void CommContextManager::SetCUDADeviceId(int dev_id) {
   phi::backends::gpu::SetDeviceId(dev_id);
+  CommContextManager::device_id = dev_id;
+#endif
 }
 
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 void CommContextManager::CreateNCCLCommContext(
     const std::shared_ptr<Store>& store,
     const std::string& unique_comm_key,
     int rank,
     int size) {
+  auto& comm_context_manager = CommContextManager::GetInstance();
+  if (comm_context_manager.Has(unique_comm_key)) {
+    return;
+  }
   ncclUniqueId nccl_id;
   if (rank == 0) {
     PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id));
@@ -68,7 +77,28 @@ void CommContextManager::CreateNCCLCommContext(
   auto nccl_comm_context =
       std::make_unique<NCCLCommContext>(rank, size, nccl_id);
-  auto& comm_context_manager = CommContextManager::GetInstance();
+
+  if (CommContextManager::device_id != -1) {
+    std::unique_ptr<phi::GPUContext> dev_ctx(
+        new phi::GPUContext(phi::GPUPlace(CommContextManager::device_id)));
+    dev_ctx->SetAllocator(phi::memory_utils::GetAllocator(
+        CommContextManager::device_id, dev_ctx->stream()));
+    dev_ctx->SetHostAllocator(phi::memory_utils::GetHostAllocator());
+    dev_ctx->SetZeroAllocator(
+        phi::memory_utils::GetZeroAllocator(CommContextManager::device_id));
+    dev_ctx->SetHostZeroAllocator(phi::memory_utils::GetHostZeroAllocator());
+    dev_ctx->SetPinnedAllocator(phi::memory_utils::GetPinnedAllocator());
+    dev_ctx->PartialInitWithAllocator();
+    auto compute_event =
+        phi::memory_utils::GetCudaEvent(CommContextManager::device_id);
+    auto comm_event =
+        phi::memory_utils::GetCudaEvent(CommContextManager::device_id);
+
+    nccl_comm_context->SetDevContext(std::move(dev_ctx));
+    nccl_comm_context->SetComputeEvent(std::move(compute_event));
+    nccl_comm_context->SetCommEvent(std::move(comm_event));
+  }
+
   comm_context_manager.SetStore(store);
   comm_context_manager.Emplace(unique_comm_key, std::move(nccl_comm_context));
 }
......
@@ -46,13 +46,13 @@ class CommContextManager {
 
   bool Has(const std::string& unique_comm_key) const;
 
+  static void SetDeviceId(int dev_id);
+
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   static void CreateNCCLCommContext(const std::shared_ptr<Store>& store,
                                     const std::string& unique_comm_key,
                                     int rank,
                                     int size);
-
-  static void SetCUDADeviceId(int dev_id);
 #endif
 
 #if defined(PADDLE_WITH_GLOO)
@@ -76,6 +76,7 @@ class CommContextManager {
   std::unordered_map<std::string, std::unique_ptr<CommContext>>
       id_to_comm_context_;
   std::shared_ptr<Store> store_;
+  static int device_id;
 };
 
 }  // namespace distributed
......
@@ -37,6 +37,30 @@ NCCLCommContext::NCCLCommContext(int rank, int size, ncclUniqueId nccl_id)
 
 ncclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; }
 
+gpuStream_t NCCLCommContext::GetStream() { return dev_ctx_->stream(); }
+
+phi::GPUContext* NCCLCommContext::GetDevContext() { return dev_ctx_.get(); }
+
+void NCCLCommContext::SetDevContext(
+    std::unique_ptr<phi::GPUContext>&& dev_ctx) {
+  dev_ctx_ = std::move(dev_ctx);
+}
+
+gpuEvent_t NCCLCommContext::GetComputeEvent() { return compute_event_.get(); }
+
+void NCCLCommContext::SetComputeEvent(
+    std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type>&&
+        compute_event) {
+  compute_event_ = std::move(compute_event);
+}
+
+gpuEvent_t NCCLCommContext::GetCommEvent() { return comm_event_.get(); }
+
+void NCCLCommContext::SetCommEvent(
+    std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type>&& comm_event) {
+  comm_event_ = std::move(comm_event);
+}
+
 void NCCLCommContext::Broadcast(phi::DenseTensor* out_tensor,
                                 const phi::DenseTensor& in_tensor,
                                 int root,
......
@@ -13,6 +13,16 @@
 // limitations under the License.
 
 #pragma once
 
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_decls.h"
 #include "paddle/phi/core/distributed/comm_context.h"
 #include "paddle/phi/core/macros.h"
@@ -30,9 +40,27 @@ namespace distributed {
 
 class NCCLCommContext final : public CommContext {
  public:
   NCCLCommContext(int rank, int size, ncclUniqueId nccl_id);
+  ~NCCLCommContext() {}
 
   ncclComm_t GetNcclComm();
 
+  gpuStream_t GetStream();
+
+  gpuEvent_t GetComputeEvent();
+
+  void SetComputeEvent(
+      std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type>&&
+          compute_event);
+
+  gpuEvent_t GetCommEvent();
+
+  void SetCommEvent(
+      std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type>&& comm_event);
+
+  phi::GPUContext* GetDevContext();
+
+  void SetDevContext(std::unique_ptr<phi::GPUContext>&& dev_ctx);
+
   void Broadcast(phi::DenseTensor* out_tensor,
                  const phi::DenseTensor& in_tensor,
                  int root,
@@ -75,6 +103,14 @@ class NCCLCommContext final : public CommContext {
   DISABLE_COPY_AND_ASSIGN(NCCLCommContext);
 
   ncclComm_t nccl_comm_;
+
+  std::unique_ptr<phi::GPUContext> dev_ctx_;
+
+  // used for comm wait compute, compute_stream-->event-->comm_stream
+  std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> compute_event_;
+
+  // used for compute wait comm, comm_stream-->event-->compute_stream
+  std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> comm_event_;
 };
 
 }  // namespace distributed
......
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
+import os
 from collections import OrderedDict
 
 import paddle
@@ -146,9 +147,6 @@ class ProcessGroup:
         global_rank = genv.rank
 
         if self.nranks >= 2 and global_rank in self.ranks:
-            logger.info(
-                f"group_id: {self.id}, ranks: {self.ranks}, nranks: {self.nranks}, trainer_endpoints: {genv.current_endpoint}"
-            )
             strategy = core.ParallelStrategy()
             strategy.nranks = self.nranks
             strategy.local_rank = self.local_rank(global_rank)
@@ -159,9 +157,22 @@ class ProcessGroup:
             strategy.nrings = 1
             if core.is_compiled_with_cuda():
                 place = core.CUDAPlace(genv.device_id)
-                core.NCCLParallelContext(strategy, place).init_with_ring_id(
-                    ring_id
-                )
+                use_new_comm = os.getenv(
+                    "FLAGS_dynamic_static_unified_comm", "0"
+                )
+                if use_new_comm in ["1", "True", "true"]:
+                    store = core.create_or_get_global_tcp_store()
+                    core.CommContextManager.set_device_id(genv.device_id)
+                    core.CommContextManager.create_nccl_comm_context(
+                        store,
+                        str(ring_id),
+                        strategy.local_rank,
+                        strategy.nranks,
+                    )
+                else:
+                    core.NCCLParallelContext(strategy, place).init_with_ring_id(
+                        ring_id
+                    )
             elif core.is_compiled_with_xpu():
                 place = core.XPUPlace(genv.device_id)
                 core.BKCLParallelContext(strategy, place).init_with_ring_id(
......
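
The branch above is opt-in via an environment flag. A hedged sketch of switching a job onto the new communication library (defaults as read by the code above; where the flag is checked depends on when process groups are instantiated):

# "0" (the default) keeps the legacy NCCLParallelContext path from the
# else-branch above; set the flag before any process group is instantiated.
import os

os.environ["FLAGS_dynamic_static_unified_comm"] = "1"

import paddle.distributed as dist

dist.init_parallel_env()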
@@ -330,7 +330,7 @@ def _init_parallel_env(backend):
             store, "0", rank, world_size
         )
     elif backend == "nccl":
-        core.CommContextManager.set_cuda_device_id(dev_id)
+        core.CommContextManager.set_device_id(dev_id)
         core.CommContextManager.create_nccl_comm_context(
             store, "0", rank, world_size
         )
......
@@ -1084,13 +1084,7 @@ def init_parallel_env():
         master_port = int(master_port)
         is_master = rank == 0
         stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900"))
-        default_store = core.TCPStore(
-            master_addr,
-            master_port,
-            is_master,
-            world_size,
-            timeout=stop_check_timeout,
-        )
+        default_store = core.create_or_get_global_tcp_store()
         _set_default_store(default_store)
         pg = _new_process_group_impl(
             backend,
@@ -1108,6 +1102,11 @@ def init_parallel_env():
     _add_new_group(group)
     parallel_helper._set_parallel_ctx(True)
 
+    # barrier will call CreateNCCLEnvCache which will call CreateNCCLCommContext.
+    # Set device_id to prevent creating null dev_ctx.
+    # TODO(mine): support XPU and other backends.
+    if backend in ["nccl", 'xccl', 'bkcl']:
+        core.CommContextManager.set_device_id(parallel_env.device_id)
     paddle.distributed.barrier(group=group)
     return group
......