Unverified commit 9fcdaeba, authored by lw921014, committed by GitHub

add allreduce and broadcast without test (#31024)

Parent commit: 5618f140
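For orientation, a condensed sketch of how the new NPU collective ops are driven (lifted from the c_hcom_op_npu_test.cc added at the end of this commit; scope, place and the HCCL init/group ops are set up as in that test):

// Build and run the new c_allreduce_sum op through the op registry.
f::AttributeMap attrs;
attrs["tag"] = std::string("tagx");  // HCCL op tag attr introduced in this commit
attrs["ring_id"] = 0;                // mapped to an HCOM group name
auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"X"}}},
                                  {{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);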
@@ -62,6 +62,7 @@ endif()
if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
@@ -73,6 +74,9 @@ if(WITH_ASCEND_CL)
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
...
@@ -205,8 +205,11 @@ elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
else()
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
# SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
# SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
endif()
cache_third_party(${TARGET_NAME}
...
@@ -151,6 +151,8 @@ set(COMMON_FLAGS
-Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-Wimplicit-fallthrough=0 # Warning in tinyformat.h
-Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
-Wno-error=nonnull-compare # Warning in boost gcc 7.2
-Wno-error=address # Warning in boost gcc 7.2
${fsanitize}
)
...
@@ -79,6 +79,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(3) << "npu num: " << platform::GetNPUDeviceCount();
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
}
@@ -141,6 +142,7 @@ class AllocatorFacadePrivate {
(size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
: allocators_)
: zero_size_allocators_);
VLOG(3) << "alloc size: " << size;
auto iter = allocators.find(place);
PADDLE_ENFORCE_NE(iter, allocators.end(),
platform::errors::NotFound(
...
@@ -24,6 +24,7 @@ namespace memory {
namespace allocation {
static AllocatorStrategy GetStrategyFromFlag() {
VLOG(3) << "FLAGS_allocator_strategy: " << FLAGS_allocator_strategy;
if (FLAGS_allocator_strategy == "naive_best_fit") {
return AllocatorStrategy::kNaiveBestFit;
}
...
@@ -11,24 +11,29 @@ foreach(src ${OPS})
set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS})
endforeach()
register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
if(WITH_NCCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
endif()
if(WITH_GLOO)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper)
endif()
if(WITH_XPU_BKCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
endif()
if(WITH_ASCEND_CL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
endif()
set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")
cc_test(c_hcom_op_npu_test SRCS c_hcom_op_npu_test.cc DEPS op_registry c_broadcast_op c_allreduce_sum_op c_comm_init_hccl_op c_create_group_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_allreduce_max,
ops::CAllReduceOpASCENDKernel<ops::kRedMax, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedMax, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedMax, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedMax, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_allreduce_min,
ops::CAllReduceOpASCENDKernel<ops::kRedMin, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedMin, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedMin, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedMin, plat::float16>)
@@ -30,6 +30,11 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle {
namespace operators {
@@ -105,6 +110,88 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
}
};
template <ReduceType red_type, typename T>
class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
 public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<framework::LoDTensor>("X");
auto out = ctx.Output<framework::LoDTensor>("Out");
auto place = ctx.GetPlace();
hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
int64_t numel = in->numel();
void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
out->Resize(in->dims());
void* recvbuff = reinterpret_cast<void*>(const_cast<T*>(out->mutable_data<T>(place)));
// recvbuff could alias sendbuff for an in-place allreduce.
std::string tag = ctx.Attr<std::string>("tag");
int ring_id = ctx.Attr<int>("ring_id");
std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
// NOTE: temporarily hard-coded to the default world group.
group = "hccl_world_group";
auto comm = paddle::platform::HCCLCommContext::Instance().Get();
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
// Map the framework-level reduce type onto the HCCL reduction op.
hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM;
switch (red_type) {
case kRedSum:
hccl_red_type = HCCL_REP_OP_SUM;
break;
case kRedMax:
hccl_red_type = HCCL_REP_OP_MAX;
break;
case kRedMin:
hccl_red_type = HCCL_REP_OP_MIN;
break;
case kRedProd:
hccl_red_type = HCCL_REP_OP_PROD;
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid reduce type: %d", red_type));
}
VLOG(3) << "begin hccl allreduce, parameter is: "
<< "input num: " << numel
<< ", dtype: " << dtype
<< ", hccl_red_type: " << hccl_red_type
<< ", group is: " << group
<< ", tag is " << tag;
VLOG(3) << "sendbuff: " << sendbuff << ", recvbuff: " << recvbuff;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce(
tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type,
group.c_str(), reinterpret_cast<void*>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should be compiled with NPU."));
#endif
}
};
template <ReduceType red_type, typename T>
class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
 public:
@@ -114,7 +201,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
auto out = ctx.Output<framework::Tensor>("Out");
auto place = ctx.GetPlace();
ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
int64_t numel = in->numel();
const void* sendbuff = in->data<void>();
out->Resize(in->dims());
@@ -170,6 +257,11 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allreduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
#pragma message("hccl CAllReduceOpMaker needs tag attr")
AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
...
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_allreduce_prod,
ops::CAllReduceOpASCENDKernel<ops::kRedProd, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedProd, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedProd, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedProd, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_allreduce_sum,
ops::CAllReduceOpASCENDKernel<ops::kRedSum, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedSum, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedSum, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedSum, plat::float16>)
@@ -42,6 +42,11 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(0);
AddAttr<int>("root", "(int default 0) root id for broadcasting.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
#pragma message("hccl CBroadcastOpMaker needs tag attr")
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
...
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<framework::LoDTensor>("X");
void *ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
int numel = x->numel();
hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
auto out = ctx.Output<framework::LoDTensor>("Out");
auto place = ctx.GetPlace();
auto comm = paddle::platform::HCCLCommContext::Instance().Get();
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int root = ctx.Attr<int>("root");
int ring_id = ctx.Attr<int>("ring_id");
std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
std::string tag = ctx.Attr<std::string>("tag");
VLOG(3) << "begin hccl broadcast, parameter is: "<< "root " << root
<< ", group is " << group
<< ", tag is " << tag;
if (root == static_cast<int>(comm->rank())) {
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_broadcast(tag.c_str(), ptr, numel,
dtype, (uint32_t)root, group.c_str(), (void*)stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
<< x->numel();
} else {
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_broadcast(tag.c_str(), ptr, numel,
dtype, (uint32_t)root, group.c_str(), (void*)stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved "
<< framework::product(out->dims());
}
if (out != x) {
framework::TensorCopy(
*static_cast<const framework::Tensor*>(x), place,
*platform::DeviceContextPool::Instance().Get(place),
static_cast<framework::Tensor*>(out));
}
out->Resize(x->dims());
out->set_lod(x->lod());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should be compiled with NPU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_broadcast,
ops::CBroadcastOpASCENDKernel<float>,
ops::CBroadcastOpASCENDKernel<int>,
ops::CBroadcastOpASCENDKernel<int8_t>,
ops::CBroadcastOpASCENDKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/npu_op_runner.h"
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace operators {
class CCommInitOpNPU : public framework::OperatorBase {
public:
CCommInitOpNPU(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::string rank_table_file = Attr<std::string>("rank_table_file");
uint32_t rank_id = Attr<int>("rank_id");
uint32_t device_id = Attr<int>("device_id");
VLOG(3) << "begin init hccl, parameter is: "
<< "rank_table_file " << rank_table_file
<< " rank_id " << rank_id
<< " device_id " << device_id;
platform::HCCLCommContext::Instance().CreateHCCLComm(rank_table_file, rank_id, device_id);
}
};
class CCommInitOpNPUMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddComment(R"DOC(
CCommInit operator on NPU
Initialize collective communication context within this trainer
)DOC");
AddAttr<std::string>("rank_table_file",
"(string) path to rank_table_file");
AddAttr<int>("rank_id", "(int) world rank id of the process");
AddAttr<int>("device_id", "(int) device id of the process/thread");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpNPU,
ops::CCommInitOpNPUMaker);
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/npu_op_runner.h"
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace operators {
class CCreateGroupOpNPU : public framework::OperatorBase {
public:
CCreateGroupOpNPU(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::string group_name = Attr<std::string>("group_name");
int nranks = Attr<int>("nranks");
std::vector<int> rank_ids = Attr<std::vector<int>>("rank_ids");
paddle::platform::HCCLCommContext::Instance().CreateHCCLGroup(
group_name, (uint32_t)nranks,
std::vector<uint32_t>(rank_ids.begin(), rank_ids.end()));
}
};
class CCreateGroupOpNPUMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddComment(R"DOC(
CCreateGroup operator on NPU
Create collective communication group on NPU
)DOC");
AddAttr<std::string>("group_name",
"(string) name of the collective communication group");
AddAttr<int>("nranks", "(int) number of the group");
AddAttr<std::vector<int>>("rank_ids",
"(list of int) The world rank id of the group members");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(c_create_group, ops::CCreateGroupOpNPU,
ops::CCreateGroupOpNPUMaker);
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_broadcast);
USE_OP(c_allreduce_sum);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_NO_KERNEL_OP(c_create_group);
USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
std::string rank_table_file = getenv("RANK_TABLE_FILE");
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
printf("rank_table_file: %s, rank_id = %d, device_id = %d\n", rank_table_file.c_str(), rank_id, device_id);
f::AttributeMap attrs;
attrs["rank_table_file"] = rank_table_file;
attrs["rank_id"] = rank_id;
attrs["device_id"] = device_id;
auto comm_init_op =
f::OpRegistry::CreateOp("c_comm_init_hccl", {}, {}, attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
f::AttributeMap create_attrs;
create_attrs["group_name"] = HCOM_GROUP_PREFIX + std::to_string(0);
create_attrs["nranks"] = 2;
std::vector<int> rank_ids{0, 1};
create_attrs["rank_ids"] = rank_ids;
auto create_group_op = f::OpRegistry::CreateOp("c_create_group", {}, {}, create_attrs);
create_group_op->Run(*scope, place);
ctx.Wait();
}
void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
std::cout<< "BEGIN TEST:" << __FUNCTION__ <<std::endl;
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>();
int num = 2;
std::vector<float> init;
int rank_id = atoi(getenv("RANK_ID"));
std::cout << "rank_id: " << rank_id << std::endl;
for (int64_t i = 0; i < num * num; ++i) {
init.push_back(1.0 + rank_id);
std::cout << init[0];
}
std::cout << std::endl;
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num, num});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num, num});
tensor_out->mutable_data<float>(place); // allocate
ctx.Wait();
// run
f::AttributeMap attrs;
attrs["tag"]=std::string("tagx");
attrs["root"]=0;
attrs["ring_id"]=0;
auto op =
f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 1.0);
}
}
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
std::cout<< "BEGIN TEST:" << __FUNCTION__ <<std::endl;
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>();
std::vector<float> init;
int rank_id = atoi(getenv("RANK_ID"));
std::cout << "rank_id: " << rank_id << std::endl;
int num1 = 1;
int num2 = 4;
for (int64_t i = 0; i < num1 * num2; ++i) {
init.push_back(1.0);
// init.push_back(1.0 + rank_id * 3);
std::cout << init[0];
}
std::cout << std::endl;
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num1, num2});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate
ctx.Wait();
// run
f::AttributeMap attrs;
attrs["tag"]=std::string("tagx");
attrs["ring_id"]=0;
auto op =
f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 2.0);
}
}
TEST(c_broadcast, NPU) {
f::Scope scope;
char* npu_id = getenv("FLAGS_selected_npus");
p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
Prepare(&scope, ctx);
// TestHCCLBroadcastOp(&scope, ctx);  // broadcast path disabled for now
TestHCCLAllReduceOp(&scope, ctx);
}
@@ -128,7 +128,7 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
cc_library(collective_helper SRCS collective_helper.cc collective_helper_npu.cc DEPS framework_proto device_context enforce)
if(WITH_GPU)
cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
...
@@ -14,20 +14,21 @@
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "boost/variant.hpp"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/dynload/hccl.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_NCCL)
// In order to apply hierarchical communication with NCCL, we need
// a communication ring that contains NCCL communicators associated to a global
// ncclUniqueId. E.g. for a hierarchical case,
@@ -47,6 +48,8 @@ namespace platform {
//
// The NCCLComm instance is created and reserved in the NCCLCommContext
// singleton with a global user specified group id.
class CUDADeviceContext;
class NCCLComm {
 public:
virtual int ring_id() const = 0;
@@ -120,8 +123,162 @@ class NCCLCommContext {
NCCLCommContext() = default;
DISABLE_COPY_AND_ASSIGN(NCCLCommContext);
};
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
// In order to apply hierarchical communication with HCCL, we need
// a communication ring that contains HCCL communicators associated to a global
// HCCLUniqueId. E.g. for a hierarchical case,
//
// 11 - 12 21 - 22
// | | | |
// 13 - 14 - 23 - 24
// | |
// 31 - 32 - 41 - 42
// | | | |
// 33 - 34 43 - 44
//
// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
// (31,32,33,34), (41,42,43,44) as bottoms respectively.
//
// We could also use a single communication ring for the flatten case
//
// The HCCLComm instance is created and reserved in the HCCLCommContext
// singleton with a global user specified group id.
class NPUDeviceContext;
class HCCLComm {
public:
virtual std::string rank_table_file() const = 0;
virtual uint32_t rank() const = 0;
virtual uint32_t device_id() const = 0;
virtual aclrtStream stream() const = 0;
virtual NPUDeviceContext* dev_context() const = 0;
virtual ~HCCLComm() = default;
};
// A singleton HCCL communicator context reserves communication ring ids
class HCCLCommContext {
public:
static HCCLCommContext& Instance() {
static HCCLCommContext comm_ctx;
return comm_ctx;
}
HCCLComm* CreateHCCLComm(const std::string& config_file, uint32_t rank, uint32_t device_id);
void CreateHCCLGroup(const std::string& group_name, uint32_t nranks, const std::vector<uint32_t>& rank_ids);
// retrieve the communicator (a single global communicator is kept for now)
HCCLComm* Get() const {
return comm_.get();
}
private:
std::once_flag once_flag_;
std::mutex comm_map_mutex_;
std::unique_ptr<HCCLComm> comm_;
HCCLComm* AssignHCCLComm(const std::string& config_file, uint32_t rank, uint32_t device_id);
HCCLCommContext() = default;
DISABLE_COPY_AND_ASSIGN(HCCLCommContext);
};
#endif
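// A usage sketch (hypothetical driver code, mirroring what the c_comm_init_hccl
// and c_create_group operators added in this commit do):
//
//   auto& ctx = paddle::platform::HCCLCommContext::Instance();
//   HCCLComm* comm = ctx.CreateHCCLComm("/path/to/rank_table.json",
//                                       /*rank=*/0, /*device_id=*/0);
//   ctx.CreateHCCLGroup("HCOM_GROUP_0", /*nranks=*/2, {0, 1});
//   aclrtStream stream = ctx.Get()->stream();  // launch collectives on this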
#if defined(PADDLE_WITH_XPU_BKCL)
// In order to apply hierarchical communication with BKCL, we need
// a communication ring that contains BKCL communicators associated to a global
// BKCLUniqueId. E.g. for a hierarchical case,
//
// 11 - 12 21 - 22
// | | | |
// 13 - 14 - 23 - 24
// | |
// 31 - 32 - 41 - 42
// | | | |
// 33 - 34 43 - 44
//
// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
// (31,32,33,34), (41,42,43,44) as bottoms respectively.
//
// We could also use a single communication ring for the flatten case
//
// The BKCLComm instance is created and reserved in the BKCLCommContext
// singleton with a global user specified group id.
class BKCLComm {
public:
virtual int ring_id() const = 0;
virtual int nranks() const = 0;
virtual int rank() const = 0;
virtual int device_id() const = 0;
virtual BKCLContext_t comm() const = 0;
virtual XPUStream stream() const = 0;
virtual XPUDeviceContext* dev_context() const = 0;
virtual ~BKCLComm() = default;
};
// A singleton BKCL communicator context reserves communication ring ids
class BKCLCommContext {
public:
static BKCLCommContext& Instance() {
static BKCLCommContext comm_ctx;
return comm_ctx;
}
BKCLComm* CreateBKCLComm(BKCLUniqueId* bkcl_id, int nranks, int rank,
int dev_id, int ring_id = 0);
void CreateAllBKCLComms(const std::vector<int>& dev_ids, int ring_id = 0);
// a later comm with the same dev_id and the same ring_id
// will override the former
BKCLComm* AssignBKCLComm(BKCLContext_t comm, int nranks, int rank, int dev_id,
int ring_id = 0);
// retrieve a communicator by the ring id in multiprocessing mode
BKCLComm* Get(int ring_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id), 0,
platform::errors::InvalidArgument(
"Communicator in ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1,
platform::errors::InvalidArgument(
"One device id should be specified to retrieve from "
"multiple communicators."));
return comm_map_.at(ring_id).begin()->second.get();
}
// retrieve a communicator by the ring id and the device id
BKCLComm* Get(int ring_id, int dev_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id), 0,
platform::errors::InvalidArgument(
"Communicator of ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_GT(
comm_map_.at(ring_id).count(dev_id), 0,
platform::errors::InvalidArgument(
"Communicator at device id %d has not been initialized in ring %d.",
dev_id, ring_id));
return comm_map_.at(ring_id).at(dev_id).get();
}
// retrieve a communicator by the ring id and place
BKCLComm* Get(int ring_id, Place place) const {
return Get(ring_id, BOOST_GET_CONST(XPUPlace, place).device);
}
private:
std::once_flag once_flag_;
std::mutex comm_map_mutex_;
// ring id to dev-BKCLComm
std::map<int, std::map<int, std::unique_ptr<BKCLComm>>> comm_map_;
void ReleaseBKCLComms();
BKCLCommContext() = default;
DISABLE_COPY_AND_ASSIGN(BKCLCommContext);
};
#endif
} // namespace platform
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
namespace paddle {
namespace platform {
class HCCLCommImpl : public HCCLComm {
public:
void set_rank_table_file(const std::string& rank_table_file) { rank_table_file_ = rank_table_file; }
std::string rank_table_file() const override { return rank_table_file_; }
void set_rank(uint32_t rank) { rank_ = rank; }
uint32_t rank() const override { return rank_; }
void set_device_id(uint32_t device_id) { device_id_ = device_id; }
uint32_t device_id() const override { return device_id_; }
aclrtStream stream() const override { return dev_ctx_->stream(); }
void set_dev_ctx(std::unique_ptr<NPUDeviceContext>&& dev_ctx) {
dev_ctx_ = std::move(dev_ctx);
}
NPUDeviceContext* dev_context() const override { return dev_ctx_.get(); }
private:
std::string rank_table_file_;
uint32_t rank_;
uint32_t device_id_;
std::unique_ptr<NPUDeviceContext> dev_ctx_;
};
HCCLComm* HCCLCommContext::CreateHCCLComm(const std::string& rank_table_file,
uint32_t rank, uint32_t device_id) {
/*
PADDLE_ENFORCE_NOT_NULL(rank_table_file,
platform::errors::InvalidArgument(
"The rank table file should not be null."));
PADDLE_ENFORCE_GE(rank, 0,
platform::errors::InvalidArgument(
"Expected rank >= 0. But received rank is %d.", rank));
PADDLE_ENFORCE_GE(device_id, 0,
platform::errors::InvalidArgument(
"Expected dev_id >= 0. But received dev_id is %d.", device_id));
*/
auto* comm_wrapper = AssignHCCLComm(rank_table_file, rank, device_id);
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_init(
rank_table_file.c_str(), std::to_string(rank).c_str()));
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_bind_model(
comm_wrapper->stream(), comm_wrapper->stream()));
VLOG(1) << "hccl communicator of rank " << rank << " has been created";
return comm_wrapper;
}
HCCLComm* HCCLCommContext::AssignHCCLComm(const std::string& rank_table_file,
uint32_t rank, uint32_t device_id) {
std::unique_ptr<NPUDeviceContext> dev_ctx(
new NPUDeviceContext(NPUPlace(device_id)));
VLOG(3) << "device_id" << device_id;
VLOG(3) << "dev_ctx->stream()" << dev_ctx->stream();
HCCLCommImpl* c = new HCCLCommImpl;
c->set_rank_table_file(rank_table_file);
c->set_rank(rank);
c->set_device_id(device_id);
c->set_dev_ctx(std::move(dev_ctx));
// comm_ = c
comm_.reset(c);
return c;
}
void HCCLCommContext::CreateHCCLGroup(const std::string& group_name, uint32_t nranks,
const std::vector<uint32_t>& rank_ids) {
/*
PADDLE_ENFORCE_NOT_NULL(group_name,
platform::errors::InvalidArgument(
"The group name should not be null."));
PADDLE_ENFORCE_GT(nranks, 0,
platform::errors::InvalidArgument(
"Expected nranks > 0. But received nranks is %d.", nranks));
PADDLE_ENFORCE_NOT_NULL(rank_ids,
platform::errors::InvalidArgument(
"The rank ids should not be null."));
*/
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_create_group(
group_name.c_str(), nranks, (unsigned int*)rank_ids.data()));
VLOG(1) << "hccl group with name " << group_name << " has been created";
}
} // namespace platform
} // namespace paddle
#endif
@@ -9,7 +9,7 @@ endif()
# There is no macOS version of NCCL.
# Disable nvrtc and cuda_driver api on MacOS and Windows, and only do an early test on Linux.
if (NOT APPLE AND NOT WIN32)
list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
if (WITH_NCCL)
list(APPEND CUDA_SRCS nccl.cc)
endif()
@@ -32,6 +32,8 @@ endif(CUPTI_FOUND)
if(WITH_ROCM_PLATFORM)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader)
hip_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
elseif (WITH_ASCEND_CL)
cc_library(dynload_warpctc SRCS warpctc.cc hccl.cc DEPS dynamic_loader warpctc)
else()
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
...
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @file base.h
* @brief HCOM data type definition
*
*/
#ifndef HCCL_BASE_H_
#define HCCL_BASE_H_
#define HCOM_GROUP_PREFIX "HCOM_GROUP_"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
typedef signed char s8;
typedef signed short s16;
typedef signed int s32;
typedef signed long long s64;
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long long u64;
/**
* @brief HCOM functions return value definition
*/
typedef enum tagHcclResult {
HCCL_SUCCESS = 0, /**< success */
HCCL_E_PARA = 1, /**< parameter error */
HCCL_E_PTR = 2, /**< empty pointer */
HCCL_E_MEMORY = 3, /**< memory error */
HCCL_E_INTERNAL = 4, /**< internal error */
HCCL_E_NOT_SUPPORT = 5, /**< not support feature */
HCCL_E_NOT_FOUND = 6, /**< not found specific resource */
HCCL_E_UNAVAIL = 7, /**< resource unavailable */
HCCL_E_SYSCALL = 8, /**< call system interface error */
HCCL_E_TIMEOUT = 9, /**< timeout */
HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */
HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */
HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */
HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */
HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */
HCCL_E_RUNTIME = 15, /**< call runtime api fail */
HCCL_E_DRV = 16, /**< call driver api fail */
HCCL_E_PROFILING = 17, /**< call profiling api fail */
HCCL_E_CCE = 18, /**< call cce api fail */
HCCL_E_NETWORK = 19, /**< call network api fail */
HCCL_E_RESERVED /**< reserved */
} hcclResult_t;
/* handle to communicator */
typedef void *hcclComm_t;
/**
* @brief HCCL Reduction operation
*/
typedef enum tagHcclRedOp {
HCCL_REP_OP_SUM = 0, /**< sum */
HCCL_REP_OP_PROD = 1, /**< prod */
HCCL_REP_OP_MAX = 2, /**< max */
HCCL_REP_OP_MIN = 3, /**< min */
HCCL_REP_OP_RESERVED /**< reserved */
} hcclRedOp_t;
/**
* @brief HCCL data type
*/
typedef enum tagHcclDataType {
HCCL_DATA_TYPE_INT8 = 0, /**< int8 */
HCCL_DATA_TYPE_INT16 = 1, /**< int16 */
HCCL_DATA_TYPE_INT32 = 2, /**< int32 */
HCCL_DATA_TYPE_FP16 = 3, /**< fp16 */
HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */
HCCL_DATA_TYPE_INT64 = 5, /**< int64 */
HCCL_DATA_TYPE_UINT64 = 6, /**< uint64 */
HCCL_DATA_TYPE_RESERVED /**< reserved */
} hcclDataType_t;
const u32 HCCL_MAX_SEGMENT_NUM = 8; // The max number of gradient segments.
/**
* @brief the feature of the model
*/
struct model_feature {
const char *model_name; /**< The model name */
u32 gradient_num; /**< The number of gradients */
float *gradient_size; /**< The size of each gradient */
float *gradient_time; /**< The BP computation time of each gradient */
};
enum GradSplitForceMode {
FORCE_NONE, /**< no force */
FORCE_SIZE, /**< force split gradient by size */
FORCE_RESERVED /**< reserved */
};
/**
* @brief stream handle.
*/
typedef void *rtStream_t;
/**
* @brief model handle.
*/
typedef void *rtModel_t;
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // HCCL_BASE_H_
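The NPU kernels above convert Paddle tensor dtypes with platform::ToHCCLDataType, which lives in hccl_helper.h and is not part of this diff. A minimal sketch of such a mapping, assuming the framework::proto::VarType ids Paddle uses elsewhere, would look like:

// Hypothetical sketch of ToHCCLDataType (the real one is in hccl_helper.h,
// not included in this commit's diff).
inline hcclDataType_t ToHCCLDataType(framework::proto::VarType::Type type) {
  if (type == framework::proto::VarType::FP32) {
    return HCCL_DATA_TYPE_FP32;
  } else if (type == framework::proto::VarType::FP16) {
    return HCCL_DATA_TYPE_FP16;
  } else if (type == framework::proto::VarType::INT32) {
    return HCCL_DATA_TYPE_INT32;
  } else if (type == framework::proto::VarType::INT8) {
    return HCCL_DATA_TYPE_INT8;
  }
  PADDLE_THROW(platform::errors::Unimplemented(
      "This datatype in hccl is not supported."));
}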
@@ -21,6 +21,10 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, "
"/usr/local/cudnn/lib. If empty [default], dlopen "
@@ -36,6 +40,11 @@ DEFINE_string(nccl_dir, "",
"For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(hccl_dir, "",
"Specify path for loading hccl library, such as libhccl.so. "
"For instance, /usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. If default, "
"dlopen will search hccl from LD_LIBRARY_PATH");
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
DEFINE_string(
@@ -383,6 +392,26 @@ void* GetNCCLDsoHandle() {
warning_msg);
#endif
}
void* GetHCCLDsoHandle() {
std::string warning_msg(
"You may need to install 'hccl2' from Huawei official website: "
"before install PaddlePaddle.");
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
#elif defined(PADDLE_WITH_ASCEND_CL)
return GetDsoHandleFromSearchPath(FLAGS_hccl_dir, "libhccl.so", true, {},
warning_msg);
#else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
warning_msg);
#endif
}
void* GetTensorRtDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
...
@@ -34,6 +34,7 @@ void* GetNVRTCDsoHandle();
void* GetCUDADsoHandle();
void* GetWarpCTCDsoHandle();
void* GetNCCLDsoHandle();
void* GetHCCLDsoHandle();
void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle();
void* GetOpDsoHandle(const std::string& dso_name);
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/hccl.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag hccl_dso_flag;
void *hccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
#if HCCL_VERSION_CODE >= 2212
HCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif
#if HCCL_VERSION_CODE >= 2703
HCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// #include <hccl/hccl.h>
// #include <hccl/hccl_types.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/dynload/hcom.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag hccl_dso_flag;
extern void* hccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_HCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using HCCL_func = decltype(&::__name); \
std::call_once(hccl_dso_flag, []() { \
hccl_dso_handle = paddle::platform::dynload::GetHCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(hccl_dso_handle, #__name); \
return reinterpret_cast<HCCL_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define HCCL_RAND_ROUTINE_EACH(__macro) \
__macro(hcom_init); \
__macro(hcom_destroy); \
__macro(hcom_bind_model); \
__macro(hcom_unbind_model); \
__macro(hcom_send); \
__macro(hcom_receive); \
__macro(hcom_broadcast); \
__macro(hcom_all_gather); \
__macro(hcom_all_reduce); \
__macro(hcom_reduce_scatter); \
__macro(hcom_create_group); \
__macro(hcom_destroy_group); \
__macro(hcom_get_rank_id); \
__macro(hcom_get_local_rank_id); \
__macro(hcom_get_local_rank_size); \
__macro(hcom_get_split_strategy); \
__macro(hcom_set_split_strategy_by_size); \
__macro(hcom_set_split_strategy_by_index); \
__macro(hcom_get_group_rank_from_world_rank); \
__macro(hcom_get_world_rank_from_group_rank);
HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
#if HCCL_VERSION_CODE >= 2212
#define HCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(HCCLBroadcast);
HCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
#endif
#if HCCL_VERSION_CODE >= 2703
#define HCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(HCCLSend); \
__macro(HCCLRecv);
HCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
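For reference, DECLARE_DYNAMIC_LOAD_HCCL_WRAP(hcom_all_reduce) expands (modulo whitespace) to roughly the following, so the first call through the wrapper dlopens libhccl.so once and caches the resolved symbol:

struct DynLoad__hcom_all_reduce {
  template <typename... Args>
  auto operator()(Args... args) -> decltype(hcom_all_reduce(args...)) {
    using HCCL_func = decltype(&::hcom_all_reduce);
    // First caller loads the shared library; subsequent calls reuse the handle.
    std::call_once(hccl_dso_flag, []() {
      hccl_dso_handle = paddle::platform::dynload::GetHCCLDsoHandle();
    });
    // Resolve the symbol once and cache it in a function-local static.
    static void* p_hcom_all_reduce = dlsym(hccl_dso_handle, "hcom_all_reduce");
    return reinterpret_cast<HCCL_func>(p_hcom_all_reduce)(args...);
  }
};
extern DynLoad__hcom_all_reduce hcom_all_reduce;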
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @file hcom.h
* @brief HCOM API
*/
#ifndef HCOM_H_
#define HCOM_H_
// #include <runtime/rt.h>
#include "paddle/fluid/platform/dynload/base.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/**
* @brief Initialize HCOM.
*
* @param rank_table A string identifying the rank table file path, include file name.
* @param identify A string identifying the identify for the rank.
* @return hcclResult_t
* @see hcom_destroy()
*/
extern hcclResult_t hcom_init(const char *rank_table, const char *identify);
/**
* @brief Destroy HCOM
*
* @return hcclResult_t
* @see hcom_init()
*/
extern hcclResult_t hcom_destroy(void);
/**
* @brief Bind the model.
*
* @param model A pointer identifying the model information.
* @param stream A pointer identifying the stream information.
* @return hcclResult_t
* @see hcom_unbind_model()
*/
extern hcclResult_t hcom_bind_model(rtModel_t model, rtStream_t stream);
/**
* @brief Unbind the model.
*
* @param model An pointer identifying the model information.
* @return hcclResult_t
* @see hcom_unbind_model()
*/
extern hcclResult_t hcom_unbind_model(rtModel_t model);
/**
* @brief All-gather operator.
*
* @param tag A string identifying the tag of the operator.
* @param inputPtr A pointer identifying the input data address of the operator.
* @param outputPtr A pointer identifying the output data address of the operator.
* @param inputCount An integer(u64) identifying the number of the input data.
* @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
* @param group A string identifying the group name of ranks participating in the operator.
* @param stream A pointer identifying the stream information.
* @return hcclResult_t
*/
extern hcclResult_t hcom_all_gather(const char *tag, void *inputPtr, void *outputPtr, u64 inputCount,
hcclDataType_t dataType, const char *group, rtStream_t stream);
/**
* @brief All-reduce operator.
*
* @param tag A string identifying the tag of the operator.
* @param inputPtr A pointer identifying the input data address of the operator.
* @param outputPtr A pointer identifying the output data address of the operator.
* @param count An integer(u64) identifying the number of the output data.
* @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
* @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod.
* @param group A string identifying the group name of ranks participating in the operator.
* @param stream A pointer identifying the stream information.
* @return hcclResult_t
*/
extern hcclResult_t hcom_all_reduce(const char *tag, void *inputPtr, void *outputPtr, u64 count,
hcclDataType_t dataType, hcclRedOp_t op, const char *group, rtStream_t stream);
/**
* @brief Broadcast operator.
*
* @param tag A string identifying the tag of the operator.
* @param ptr A pointer identifying the data address of the operator.
* @param count An integer(u64) identifying the number of the data.
* @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
 * @param root An integer(u32) identifying the root rank in the operator.
* @param group A string identifying the group name of ranks participating in the operator.
* @param stream A pointer identifying the stream information.
* @return hcclResult_t
*/
extern hcclResult_t hcom_broadcast(const char *tag, void *ptr, u64 count, hcclDataType_t dataType, u32 root,
const char *group, rtStream_t stream);
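/* Example (as invoked by the c_broadcast NPU kernel added in this commit):
 *   hcom_broadcast(tag.c_str(), ptr, numel, dtype,
 *                  (u32)root, group.c_str(), stream);
 * Every rank in `group` makes the same call with the same tag and root; the
 * root's buffer is replicated into `ptr` on all other ranks.
 */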
/**
* @brief Reduce-scatter operator.
*
* @param tag A string identifying the tag of the operator.
* @param inputPtr A pointer identifying the input data address of the operator.
* @param outputPtr A pointer identifying the output data address of the operator.
* @param count An integer(u64) identifying the number of the data.
* @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
* @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod.
* @param group A string identifying the group name of ranks participating in the operator.
* @param stream A pointer identifying the stream information.
* @return hcclResult_t
*/
extern hcclResult_t hcom_reduce_scatter(const char *tag, void *inputPtr, void *outputPtr, u64 count,
hcclDataType_t dataType, hcclRedOp_t op, const char *group, rtStream_t stream);
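/*
 * Illustrative sketch. Here `count` is read as the per-rank output element
 * count, mirroring the NCCL reduce-scatter convention, so the input buffer
 * would hold rank_size * count elements; verify against the toolkit
 * documentation.
 */
static inline hcclResult_t example_reduce_scatter(void *send_buf,
                                                  void *recv_buf, u64 count,
                                                  hcclRedOp_t reduce_op,
                                                  const char *group,
                                                  rtStream_t stream) {
  return hcom_reduce_scatter("reduce_scatter_example", send_buf, recv_buf,
                             count, HCCL_DATA_TYPE_FP32, reduce_op, group,
                             stream);
}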
/**
* @brief Get the number of ranks in the group.
*
* @param group A string identifying the group name.
* @param rankSize A pointer identifying the number of ranks.
* @return hcclResult_t
*/
hcclResult_t hcom_get_rank_size(const char *group, u32 *rankSize);
/**
* @brief Get the number of ranks on this rank's server within the group.
*
* @param group A string identifying the group name.
* @param localRankSize A pointer identifying the number of local ranks.
* @return hcclResult_t
*/
hcclResult_t hcom_get_local_rank_size(const char *group, u32 *localRankSize);
/**
* @brief Get the rank id of this rank.
*
* @param group A string identifying the group name.
* @param rankId A pointer identifying the rank id.
* @return hcclResult_t
*/
hcclResult_t hcom_get_rank_id(const char *group, u32 *rankId);
/**
* @brief Get the local rank id of this rank's server within the group.
*
* @param group A string identifying the group name.
* @param localRankId A pointer identifying the local rank id.
* @return hcclResult_t
*/
hcclResult_t hcom_get_local_rank_id(const char *group, u32 *localRankId);
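/*
 * Illustrative sketch: query the world size and this rank's global and
 * server-local ids for a group that is assumed to exist already (e.g. created
 * during HCCL initialization).
 */
static inline void example_query_ranks(const char *group) {
  u32 rank_size = 0, rank_id = 0, local_rank_id = 0;
  if (hcom_get_rank_size(group, &rank_size) != HCCL_SUCCESS) return;
  if (hcom_get_rank_id(group, &rank_id) != HCCL_SUCCESS) return;
  if (hcom_get_local_rank_id(group, &local_rank_id) != HCCL_SUCCESS) return;
  /* rank_id is unique across servers; local_rank_id restarts on each server. */
}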
/**
* @brief Get the world rank id according to the group rank id.
*
* @param group A string identifying the group name.
* @param groupRank An integer(u32) identifying the group rank id.
* @param worldRank A pointer identifying the world rank id.
* @return hcclResult_t
*/
hcclResult_t hcom_get_world_rank_from_group_rank(const char *group, u32 groupRank, u32 *worldRank);
/**
* @brief Get the group rank id according to the world rank id.
*
* @param worldRank An integer(u32) identifying the world rank id.
* @param group A string identifying the group name.
* @param groupRank A pointer identifying the group rank id.
* @return hcclResult_t
*/
hcclResult_t hcom_get_group_rank_from_world_rank(u32 worldRank, const char *group, u32 *groupRank);
/**
* @brief Create group.
*
* @param group A string identifying the group name.
* @param rankNum An integer(u32) identifying the number of ranks in the group.
* @param rankIds A list identifying the ranks in the group.
* @return hcclResult_t
*/
hcclResult_t hcom_create_group(const char *group, u32 rankNum, u32 *rankIds);
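/*
 * Illustrative sketch: build a two-rank subgroup from world ranks 0 and 1,
 * run collectives on it, then tear it down. The group name is an arbitrary
 * example.
 */
static inline void example_group_lifecycle(void) {
  u32 rank_ids[2] = {0, 1};
  if (hcom_create_group("example_sub_group", 2, rank_ids) != HCCL_SUCCESS)
    return;
  /* ... launch collectives with group "example_sub_group" ... */
  hcom_destroy_group("example_sub_group");
}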
/**
* @brief Destroy group
*
* @param group A string identifying the group name.
* @return hcclResult_t
*/
hcclResult_t hcom_destroy_group(const char *group);
/**
* @brief Send operator.
*
* @param tag A string identifying the tag of the operator.
* @param inputPtr A pointer identifying the input data address of the operator.
* @param count An integer(u64) identifying the number of data elements.
* @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
* @param destRank An integer identifying the destination rank.
* @param srTag An integer identifying the send/recv message tag.
* The message will be received by the receive operator with the same "sr_tag".
* @param group A string identifying the group name of ranks participating in the operator.
* @param stream A pointer identifying the stream information.
* @return hcclResult_t
*/
hcclResult_t hcom_send(const char *tag, void *inputPtr, u64 count, hcclDataType_t dataType,
u32 destRank, u32 srTag, const char *group, rtStream_t stream);
/**
* @brief Receive operator.
*
* @param tag A string identifying the tag of the operator.
* @param outputPtr A pointer identifying the output data address of the operator.
* @param count An integer(u64) identifying the number of data elements.
* @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
* @param srcRank An integer identifying the source rank.
* @param srTag An integer identifying the send/recv message tag.
* The message will be sent by the send operator with the same "sr_tag".
* @param group A string identifying the group name of ranks participating in the operator.
* @param stream A pointer identifying the stream information.
* @return hcclResult_t
*/
hcclResult_t hcom_receive(const char *tag, void *outputPtr, u64 count, hcclDataType_t dataType,
u32 srcRank, u32 srTag, const char *group, rtStream_t stream);
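/*
 * Illustrative sketch of a paired exchange: tag, count, dataType and srTag
 * must match on both sides; the rank ids here are ranks within `group`.
 */
static inline void example_send_recv(u32 my_rank, void *buf, u64 count,
                                     const char *group, rtStream_t stream) {
  if (my_rank == 0) {
    hcom_send("p2p_example", buf, count, HCCL_DATA_TYPE_FP32,
              /* destRank */ 1, /* srTag */ 0, group, stream);
  } else if (my_rank == 1) {
    hcom_receive("p2p_example", buf, count, HCCL_DATA_TYPE_FP32,
                 /* srcRank */ 0, /* srTag */ 0, group, stream);
  }
}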
/**
* @brief Get the gradient split strategy within the group.
*
* @param group A string identifying the group name.
* @param feature A pointer identifying the feature of the model.
* @param maxSegmentNum An integer(u32) identifying the maximum number of gradient segments.
* @param segmentNum A pointer identifying the number of gradient segments.
* @param segmentIdx A list identifying the index of the end gradient in each segment.
* @param force An enum identifying the force mode of the split strategy (defaults to FORCE_NONE).
* @return hcclResult_t
*/
hcclResult_t hcom_get_split_strategy(const char *group, const struct model_feature *feature,
u32 maxSegmentNum, u32 *segmentNum, u32 *segmentIdx, GradSplitForceMode force = FORCE_NONE);
/**
* @brief Set the gradient split strategy within the group, according to gradient index.
*
* @param group A string identifying the group name.
* @param segmentNum An integer(u32) identifying the number of gradient segments.
* @param IdxList A list identifying the index of the end gradient in each segment.
* @return hcclResult_t
*/
extern hcclResult_t hcom_set_split_strategy_by_index(const char *group, u32 segmentNum, const u32 *IdxList);
/**
* @brief Set the gradient split strategy within the group, according to gradient data size.
*
* @param group A string identifying the group name.
* @param segmentNum An integer(u32) identifying the number of gradient segments.
* @param sizeList A list identifying the size percentage of each segment.
* @return hcclResult_t
*/
extern hcclResult_t hcom_set_split_strategy_by_size(const char *group, u32 segmentNum, const float *sizeList);
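/*
 * Illustrative sketch: force a two-segment split by index. The index values
 * are invented for the example (end-gradient index of each segment).
 */
static inline hcclResult_t example_split_by_index(const char *group) {
  const u32 segment_num = 2;
  const u32 idx_list[2] = {63, 127}; /* end-gradient index of each segment */
  return hcom_set_split_strategy_by_index(group, segment_num, idx_list);
}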
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // HCOM_H_
...@@ -40,6 +40,7 @@ limitations under the License. */ ...@@ -40,6 +40,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h" #include "acl/acl.h"
#include "paddle/fluid/platform/dynload/hcom.h"
#endif // PADDLE_WITH_ASCEND_CL #endif // PADDLE_WITH_ASCEND_CL
#include <fstream> #include <fstream>
...@@ -1012,6 +1013,7 @@ struct NPUStatusType {}; ...@@ -1012,6 +1013,7 @@ struct NPUStatusType {};
} }
DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE); DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
DEFINE_NPU_STATUS_TYPE(hcclResult_t, HCCL_SUCCESS);
} // namespace details } // namespace details
inline std::string build_npu_error_msg(aclError stat) { inline std::string build_npu_error_msg(aclError stat) {
...@@ -1020,6 +1022,13 @@ inline std::string build_npu_error_msg(aclError stat) { ...@@ -1020,6 +1022,13 @@ inline std::string build_npu_error_msg(aclError stat) {
return sout.str(); return sout.str();
} }
inline std::string build_npu_error_msg(hcclResult_t stat) {
std::ostringstream sout;
sout << " HCCL error, the error code is : " << stat << ". ";
return sout.str();
}
#define PADDLE_ENFORCE_NPU_SUCCESS(COND) \ #define PADDLE_ENFORCE_NPU_SUCCESS(COND) \
do { \ do { \
auto __cond__ = (COND); \ auto __cond__ = (COND); \
......
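// With DEFINE_NPU_STATUS_TYPE(hcclResult_t, HCCL_SUCCESS) registered above,
// HCCL calls can be guarded the same way as ACL calls. A minimal sketch,
// assuming the dynload wrapper added by this patch exposes the hcom_* symbols:
//   u32 rank_size = 0;
//   PADDLE_ENFORCE_NPU_SUCCESS(
//       platform::dynload::hcom_get_rank_size("example_group", &rank_size));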
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_ASCEND_CL)
#include <stdio.h>
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <typeindex>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/collective_helper.h"
#ifdef PADDLE_WITH_NCCL
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/dynload/hccl.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#define NCCL_ID_VARNAME "NCCLID"
namespace paddle {
namespace platform {
inline hcclDataType_t ToHCCLDataType(framework::proto::VarType::Type type) {
if (type == framework::proto::VarType::FP32) {
return HCCL_DATA_TYPE_FP32;
} else if (type == framework::proto::VarType::FP16) {
return HCCL_DATA_TYPE_FP16;
} else if (type == framework::proto::VarType::INT32) {
return HCCL_DATA_TYPE_INT32;
} else if (type == framework::proto::VarType::INT8) {
return HCCL_DATA_TYPE_INT8;
}
// else if (type == framework::proto::VarType::FP64) {
// return HCCL_DATA_TYPE_FP32;
// }
else {
PADDLE_THROW(platform::errors::Unimplemented(
"This datatype in hccl is not supported."));
}
}
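// Usage sketch: convert a Paddle var dtype to its HCCL counterpart before
// launching a collective; unsupported types (e.g. FP64, still commented out
// above) make the function throw.
//   hcclDataType_t dtype = ToHCCLDataType(framework::proto::VarType::FP32);
//   // dtype == HCCL_DATA_TYPE_FP32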
// // NOTE(minqiyang): according to the ncclGroupEnd documentations:
// // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
// // ncclGroupEnd will wait for all communicators to be initialized, which will
// // cause blocking problem when a runtime_error was thrown, so try only guard
// // NCCL actions when use it.
// class NCCLGroupGuard {
// public:
// static std::mutex &NCCLMutex() {
// static std::mutex mtx;
// return mtx;
// }
// inline NCCLGroupGuard() {
// NCCLMutex().lock();
// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart());
// }
// inline ~NCCLGroupGuard() PADDLE_MAY_THROW {
// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd());
// NCCLMutex().unlock();
// }
// };
// struct NCCLContext {
// std::unique_ptr<CUDADeviceContext> ctx_;
// ncclComm_t comm_;
// explicit NCCLContext(int dev_id)
// : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
// gpuStream_t stream() const { return ctx_->stream(); }
// ncclComm_t comm() const { return comm_; }
// int device_id() const {
// return BOOST_GET_CONST(platform::CUDAPlace, ctx_->GetPlace()).device;
// }
// };
// struct NCCLContextMap {
// std::unordered_map<int, NCCLContext> contexts_;
// std::vector<int> order_;
// explicit NCCLContextMap(const std::vector<platform::Place> &places,
// ncclUniqueId *nccl_id = nullptr,
// size_t num_trainers = 1, size_t trainer_id = 0) {
// PADDLE_ENFORCE_EQ(!places.empty(), true,
// platform::errors::InvalidArgument(
// "The NCCL place should not be empty."));
// order_.reserve(places.size());
// for (auto &p : places) {
// int dev_id = BOOST_GET_CONST(CUDAPlace, p).device;
// order_.emplace_back(dev_id);
// contexts_.emplace(dev_id, NCCLContext(dev_id));
// }
// PADDLE_ENFORCE_EQ(
// order_.size(), contexts_.size(),
// platform::errors::Unavailable("NCCL Context Map does not support "
// "contain two or more same device."));
// std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
// // if num_trainers == 1, should create a new nccl id for local comms.
// if (num_trainers == 1 && nccl_id == nullptr) {
// std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
// PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
// comms.get(), static_cast<int>(order_.size()), order_.data()));
// } else {
// PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument(
// "The NCCL id should not be null."));
// {
// int nranks = num_trainers * order_.size();
// NCCLGroupGuard gurad;
// for (size_t i = 0; i < order_.size(); ++i) {
// int gpu_id = order_[i];
// int rank;
// if (order_.size() > 1) {
// rank = trainer_id * order_.size() + i;
// } else {
// rank = trainer_id;
// }
// VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks
// << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
// SetDeviceId(gpu_id);
// PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
// comms.get() + i, nranks, *nccl_id, rank));
// }
// }
// }
// int i = 0;
// for (auto &dev_id : order_) {
// contexts_.at(dev_id).comm_ = comms[i++];
// }
// }
// NCCLContextMap(const NCCLContextMap &other) = delete;
// NCCLContextMap &operator=(const NCCLContextMap &other) = delete;
// CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
// CUDADeviceContext *DevCtx(platform::Place p) const {
// return DevCtx(BOOST_GET_CONST(CUDAPlace, p).device);
// }
// const NCCLContext &at(platform::Place p) const {
// return this->at(BOOST_GET_CONST(CUDAPlace, p).device);
// }
// const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); }
// void WaitAll() {
// for (auto &p : contexts_) {
// p.second.ctx_->Wait();
// }
// }
// };
// inline std::string GetFlatNCCLVarName(size_t pos) {
// if (pos == 0) {
// return NCCL_ID_VARNAME;
// }
// return string::Sprintf("%s_%d", NCCL_ID_VARNAME, static_cast<int>(pos));
// }
// inline std::string GetHierarchicalExterNCCLVarName(size_t pos) {
// return string::Sprintf("Hierarchical_exter_%s_%d", NCCL_ID_VARNAME,
// static_cast<int>(pos));
// }
// inline std::string GetHierarchicalInterNCCLVarName(size_t pos) {
// return string::Sprintf("Hierarchical_inter_%s_%d", NCCL_ID_VARNAME,
// static_cast<int>(pos));
// }
// class NCCLCommunicator {
// public:
// NCCLCommunicator() {}
// virtual ~NCCLCommunicator() PADDLE_MAY_THROW {}
// NCCLContextMap *DefaultFlatCtx() const {
// if (flat_ctxs_.size() == 0) {
// return nullptr;
// }
// return flat_ctxs_[0].get();
// }
// std::vector<std::unique_ptr<NCCLContextMap>> *GetFlatCtxs() {
// return &flat_ctxs_;
// }
// NCCLContextMap *GetFlatCtx(size_t run_order) const {
// return flat_ctxs_[run_order % flat_ctxs_.size()].get();
// }
// NCCLContextMap *GetRunEnvNCCLCtx(size_t run_order,
// bool use_hierarchical_allreduce) const {
// if (!use_hierarchical_allreduce) {
// return GetFlatCtx(run_order);
// }
// return GetHierarchicalInterCtx(run_order);
// }
// *When nccl inits nccl comm using ncclCommInitAll, it meets error when
// *allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So
// *create a new nccl comm for sync_batch_norm_op. And these codes should be
// *polished with a unified nccl management.
// NCCLContextMap *GetSyncBatchNormCtx(
// framework::Scope *scope, const std::vector<platform::Place> &places) {
// auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
// if (nccl_id_var != nullptr) {
// return DefaultFlatCtx();
// }
// if (sync_batch_norm_ctx_.get() == nullptr) {
// sync_batch_norm_ctx_.reset(new NCCLContextMap(places));
// }
// return sync_batch_norm_ctx_.get();
// }
// void InitFlatCtxs(const std::vector<platform::Place> &places,
// const std::vector<ncclUniqueId *> &nccl_ids,
// size_t trainers_num, size_t trainer_id) {
// if (nccl_ids.size() == 0) {
// auto ptr = new platform::NCCLContextMap(places);
// VLOG(1) << "init local trainer";
// flat_ctxs_.emplace_back(ptr);
// } else {
// for (size_t i = 0; i < nccl_ids.size(); i++) {
// auto ptr = new platform::NCCLContextMap(places, nccl_ids[i],
// trainers_num, trainer_id);
// VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i;
// flat_ctxs_.emplace_back(ptr);
// }
// }
// // as Executor have no way to use ncclComm created by ParallelExecutor,
// // we assign all flatten contexts to NCCLCommContext to fix.
// int nranks = static_cast<int>(trainers_num * places.size());
// int nrings = static_cast<int>(flat_ctxs_.size());
// for (int ring_id = 0; ring_id < nrings; ++ring_id) {
// for (size_t p = 0; p < places.size(); ++p) {
// int rank = trainer_id * places.size() + p;
// int dev_id = BOOST_GET_CONST(CUDAPlace, places[p]).device;
// auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id);
// NCCLCommContext::Instance().AssignNCCLComm(ctx.comm_, nranks, rank,
// dev_id, ring_id);
// }
// }
// }
// void InitHierarchicalCtxs(const std::vector<platform::Place> &places,
// const std::vector<ncclUniqueId *> &inter_nccl_ids,
// const std::vector<ncclUniqueId *> &exter_nccl_ids,
// size_t trainers_num, size_t trainer_id,
// size_t inter_trainers_num,
// size_t exter_trainers_num) {
// PADDLE_ENFORCE_EQ(
// trainers_num, inter_trainers_num * exter_trainers_num,
// platform::errors::InvalidArgument(
// "trainers_num:%llu != inter_trainers_num:%llu * "
// "exter_trainers_num:%llu",
// trainers_num, inter_trainers_num, exter_trainers_num));
// PADDLE_ENFORCE_GT(
// inter_trainers_num, 1,
// platform::errors::InvalidArgument(
// "The inter_trainers_num:%llu should be larger than 1.",
// inter_trainers_num));
// int inter_trainer_id = trainer_id % inter_trainers_num;
// for (size_t i = 0; i < inter_nccl_ids.size(); i++) {
// VLOG(1) << "init inter_trainer_id:" << inter_trainer_id
// << ", comm no:" << i;
// auto local = new NCCLContextMap(places, inter_nccl_ids[i],
// inter_trainers_num, inter_trainer_id);
// h_inter_ctxs_.emplace_back(local);
// }
// int exter_trainer_id = -1;
// if (trainer_id % inter_trainers_num == 0) {
// exter_trainer_id = trainer_id / inter_trainers_num;
// }
// if (exter_trainer_id >= 0) {
// for (size_t i = 0; i < exter_nccl_ids.size(); i++) {
// auto ex = new NCCLContextMap(places, exter_nccl_ids[i],
// exter_trainers_num, exter_trainer_id);
// VLOG(1) << "init exter_trainer_id:" << exter_trainer_id
// << ", comm no:" << i;
// h_exter_ctxs_.emplace_back(ex);
// }
// }
// }
// bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; }
// NCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const {
// PADDLE_ENFORCE_GT(h_inter_ctxs_.size(), 0,
// platform::errors::InvalidArgument(
// "Hierarchical ctxs should be initialized firstly!"));
// return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get();
// }
// NCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const {
// PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0,
// platform::errors::InvalidArgument(
// "Hierarchical ctxs should be initialized firstly!"));
// return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get();
// }
// std::vector<std::unique_ptr<NCCLContextMap>> *GetHierarchicalInterCtxs() {
// return &h_inter_ctxs_;
// }
// std::vector<std::unique_ptr<NCCLContextMap>> *GetHierarchicalExterCtxs() {
// return &h_exter_ctxs_;
// }
// protected:
// // Support multi nccl comm on default nccl ring while NCCLContextMap can't.
// std::vector<std::unique_ptr<NCCLContextMap>> flat_ctxs_;
// // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce.
// // And h_exter_ctxs_ can support multi comm too.
// std::vector<std::unique_ptr<NCCLContextMap>> h_inter_ctxs_;
// std::vector<std::unique_ptr<NCCLContextMap>> h_exter_ctxs_;
// // just used for sync_batch_norm op.
// std::unique_ptr<NCCLContextMap> sync_batch_norm_ctx_;
// };
} // namespace platform
} // namespace paddle
#endif
...@@ -326,7 +326,7 @@ GenerateOpFunctions(const std::string& module_name) { ...@@ -326,7 +326,7 @@ GenerateOpFunctions(const std::string& module_name) {
} }
ins_initializer += "}"; ins_initializer += "}";
if (input_args.back() == ',') { if (!input_args.empty() && input_args.back() == ',') {
input_args.pop_back(); input_args.pop_back();
} }
......
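// Why the added empty() guard matters (illustrative sketch, not from the
// patch): for an op with no inputs, input_args stays empty, and calling
// back() on an empty std::string is undefined behavior.
//   std::string input_args;  // op with no inputs
//   if (!input_args.empty() && input_args.back() == ',') {
//     input_args.pop_back();  // strips only a real trailing comma
//   }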