Unverified commit 80dd1672, authored by 张春乔, committed by GitHub

mv PADDLE_WITH_ASCEND_CL (#52535)

Parent 29c28e2f
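The change applied across every file below follows one pattern: PADDLE_WITH_ASCEND_CL is dropped from compile-time guard lists and the NPU/HCCL-only branches it guarded are deleted, leaving only the remaining backends (NCCL/RCCL, BKCL, GLOO, CNCL, MLU, ...) in the guards. A minimal standalone C++ sketch of that guard-simplification pattern (hypothetical function name, not Paddle source; only the real macro names are reused for illustration):

#include <cstdio>

// Before this commit a guard typically read:
//   #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL)
// After it, the Ascend macro is gone and only the surviving backends remain:
#if defined(PADDLE_WITH_NCCL)
void ReleaseCollectiveHandle() {
  // branch kept for a backend that is still supported
  std::puts("collective backend compiled in: release the handle");
}
#else
void ReleaseCollectiveHandle() {
  // no collective backend compiled in: nothing to do
}
#endif

int main() {
  ReleaseCollectiveHandle();
  return 0;
}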
......@@ -52,7 +52,7 @@ void MessageBus::Init(
}
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_XPU_BKCL)
// NOTE: To make brpc compatible with collective,
// we need to release the handler holding the IP address.
if (addr_ != "") {
......
......@@ -2128,12 +2128,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
// CPUKernel will be executed and a warning will be given at the same
// time.
expected_kernel_key.place_ = platform::CPUPlace();
#ifdef PADDLE_WITH_ASCEND_CL
if (SupportNPU()) {
auto& dev_ctx = ctx.device_context();
expected_kernel_key.place_ = dev_ctx.GetPlace();
}
#endif
if (platform::is_cpu_place(expected_kernel_key.place_)) {
LOG_FIRST_N(WARNING, 1)
<< "Op(" << type_
......@@ -2305,16 +2300,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
platform::is_mlu_place(expected_kernel_key.place_)) {
......
......@@ -150,15 +150,6 @@ AmpOperators::AmpOperators()
unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
unsupported_ops_gpu_bf16.end());
// NOTE: GPU/NPU/XPU/MLU are compiled separately.
#elif defined(PADDLE_WITH_ASCEND_CL)
auto unsupported_ops_npu_fp16 = std::get<2>(
OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_npu_fp16.begin(),
unsupported_ops_npu_fp16.end());
auto unsupported_ops_npu_bf16 = std::get<2>(
OpSupportedInfos("NPU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_npu_bf16.begin(),
unsupported_ops_npu_bf16.end());
#elif defined(PADDLE_WITH_XPU)
auto unsupported_ops_xpu_fp16 = std::get<2>(
OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16));
......
......@@ -34,8 +34,6 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "xpu/refactor/math.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
......@@ -270,32 +268,6 @@ void TensorAdd(const VarType& src, VarType* dst) {
#endif
}
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place)) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<platform::NPUDeviceContext*>(ctx);
if (data_type == framework::DataTypeTrait<float>::DataType()) {
dst_tensor->mutable_data<float>(place);
} else if (data_type == framework::DataTypeTrait<double>::DataType()) {
dst_tensor->mutable_data<double>(place);
} else if (data_type ==
framework::DataTypeTrait<platform::float16>::DataType()) {
dst_tensor->mutable_data<platform::float16>(place);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type),
place));
}
const auto& runner = operators::NpuOpRunner(
"Add", {*dst_tensor, src_tensor}, {*dst_tensor}, {});
runner.Run(dev_ctx->stream());
return;
}
#endif
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(place)) {
if (data_type == framework::DataTypeTrait<float>::DataType()) {
......
......@@ -12,62 +12,3 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/imperative/parallel_context.h"
namespace paddle {
namespace framework {
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace imperative {
class HCCLParallelContext : public ParallelContext {
public:
explicit HCCLParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: ParallelContext(strategy, place) {}
~HCCLParallelContext() override = default;
void BcastHCCLId(const std::vector<HcclRootInfo>& hccl_ids,
int root, // NOLINT
int server_fd);
void Init() override;
void InitWithRingID(int ring_id) override;
void AllReduceByStream(const framework::Variable& src,
framework::Variable* dst,
int ring_id,
bool use_calc_stream) override;
void Broadcast(framework::Variable* src, int ring_id) override;
paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;
void WaitCompute(int ring_id) override;
void WaitComm(int ring_id) override;
void SynchronizeCompute() override;
private:
// used for comm wait compute, compute_stream-->event-->comm_stream[ring_id]
std::vector<std::shared_ptr<platform::NpuStreamObject>> compute_events_;
// used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream
std::vector<std::shared_ptr<platform::NpuEventObject>> comm_events_;
};
} // namespace imperative
} // namespace paddle
#endif
......@@ -42,8 +42,6 @@ HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy,
: ParallelContext(strategy, platform::CUDAPlace(device_id))
#elif PADDLE_WITH_XPU_BKCL
: ParallelContext(strategy, platform::XPUPlace(device_id))
#elif PADDLE_WITH_ASCEND_CL
: ParallelContext(strategy, platform::NPUPlace(device_id))
#else
: ParallelContext(strategy, platform::CPUPlace())
#endif
......@@ -112,11 +110,6 @@ HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy,
node_parallel_ctx_ =
std::make_shared<BKCLParallelContext>(node_strategy_, node_place_);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
node_place_ = platform::NPUPlace(device_id);
node_parallel_ctx_ =
std::make_shared<HCCLParallelContext>(node_strategy_, node_place_);
#endif
}
void HeterParallelContext::Init() {
......
......@@ -24,11 +24,6 @@
#ifdef PADDLE_WITH_XPU_BKCL
#include "paddle/fluid/imperative/bkcl_context.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/imperative/hccl_context.h"
#endif
#include "paddle/fluid/imperative/gloo_context.h"
#include "paddle/fluid/imperative/parallel_context.h"
......
......@@ -458,17 +458,6 @@ PreparedOp PrepareImpl(
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
paddle::platform::is_npu_place(fluid_kernel_type.place_)) {
VLOG(3) << "missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << fluid_kernel_type
<< ", fallbacking to CPU one!";
fluid_kernel_type.place_ = platform::CPUPlace();
kernel_iter = kernels.find(fluid_kernel_type);
}
#endif
#ifdef PADDLE_WITH_IPU
if (kernel_iter == kernels.end() &&
paddle::platform::is_ipu_place(fluid_kernel_type.place_)) {
......
......@@ -31,7 +31,7 @@ namespace imperative {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_CNCL)
// div the nranks
void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
phi::DenseTensor *tensor =
......@@ -305,17 +305,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
"Please recompile or reinstall Paddle with BKCL support."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
ConcatTensorsWithType(
static_cast<const platform::NPUDeviceContext &>(context),
dense_tensors_,
&dense_contents_,
dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't concat npu grads since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_CNCL
ConcatTensorsWithType(
......@@ -365,17 +358,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
"Please recompile or reinstall Paddle with BKCL support."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
SplitTensorsWithType(
static_cast<const platform::NPUDeviceContext &>(context),
&dense_contents_,
&dense_tensors_,
dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't split npu grad since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_CNCL
SplitTensorsWithType(
......@@ -1129,9 +1115,8 @@ void Reducer::FinalizeBackward() {
if (find_unused_vars_each_step_) {
// TODO(liuyuhui): support TensorCopy/TensorFromVector/TensorToVector on XPU
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \
defined(PADDLE_WITH_CNCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_CNCL)
ProcessUnusedDenseVars();
#endif
// Initialize local used vars
......
......@@ -46,7 +46,7 @@ namespace imperative {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_CNCL)
template <typename T>
struct DivNRanksFunctor {
......
......@@ -135,15 +135,10 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
gc.reset(new framework::CPUGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
} else if (platform::is_npu_place(place)) {
#if defined(PADDLE_WITH_ASCEND_CL)
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
gc.reset(new framework::NPUUnsafeFastGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use NPU device since it's not compiled with NPU,"
"Please recompile or reinstall Paddle with NPU support."));
#endif
} else if (platform::is_ipu_place(place)) {
#if defined(PADDLE_WITH_IPU)
gc.reset(new framework::IPUGarbageCollector(place, 0));
......@@ -303,12 +298,8 @@ void Tracer::TraceOpImpl(const std::string& type,
"PaddlePaddle should compile with XPU if use XPUPlace."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
platform::SetNPUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
platform::SetMLUDeviceId(place.device);
......
......@@ -49,12 +49,6 @@ class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Scale",
"(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
"operator.");
#ifdef PADDLE_WITH_ASCEND_CL
AddInput("FloatStatus",
"(Tensor) 1-dim tensor of shape [8], allocated by "
"alloc_float_status op")
.AsDispensable();
#endif
AddOutput("Out",
"(Tensors) The scaled output tensor of "
"check_finite_and_unscale operator.")
......
......@@ -57,26 +57,7 @@ struct FillConstantVisitor {
void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
std::is_same<T, int16_t>::value)>::type
* = nullptr) const {
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(dev_ctx_.GetPlace())) {
phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(dtype_));
tensor_tmp.mutable_data<T>({1}, context_.GetPlace());
FillNpuTensorWithConstant<T>(&tensor_tmp, static_cast<T>(value_));
const auto &runner =
NpuOpRunner("FillD",
{tensor_tmp},
{*tensor_},
{{"dims", phi::vectorize(tensor_->dims())}});
auto stream =
context_.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
} else {
phi::funcs::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
}
#elif defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
if (platform::is_mlu_place(context_.GetPlace())) {
FillMLUTensorWithHostValue<T>(context_, static_cast<T>(value_), tensor_);
} else {
......@@ -235,12 +216,6 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
// Init the continuous space
size_t offset = 0;
if (context.Attr<bool>("copy_data")) {
#ifdef PADDLE_WITH_ASCEND_CL
framework::VisitDataType(
dtype,
FillConstantVisitor<DeviceContext>(
dev_ctx, fused_tensor, static_cast<float>(0.0), dtype, context));
#endif
for (size_t i = 0; i < in_var_names.size(); ++i) {
size_t len = static_cast<size_t>(in_tensors[i]->numel());
auto sub_tensor = fused_tensor->Slice(
......@@ -534,25 +509,6 @@ REGISTER_OPERATOR(coalesce_tensor,
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#if defined(PADDLE_WITH_ASCEND_CL)
REGISTER_OP_CUDA_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, double>);
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
REGISTER_OP_NPU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
ops::CoalesceTensorOpKernel<phi::CPUContext, float>,
ops::CoalesceTensorOpKernel<phi::CPUContext, plat::float16>,
ops::CoalesceTensorOpKernel<phi::CPUContext, double>);
#endif
#if defined(PADDLE_WITH_MLU)
REGISTER_OP_MLU_KERNEL(
coalesce_tensor,
......
......@@ -44,10 +44,6 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allgather result");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all gather.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -16,10 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -27,51 +23,8 @@ template <typename T>
class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
framework::DDim out_dims = in->dims();
out_dims[0] *= nranks;
out->mutable_data<T>(out_dims, place);
uint64_t send_numel = in->numel();
void *send_buff = reinterpret_cast<void *>(const_cast<T *>(in->data<T>()));
void *recv_buff = reinterpret_cast<void *>(out->data<T>());
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext *>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
VLOG(3) << "begin hccl allgather, parameter is: "
<< ", group is " << group << ", ring_id is " << ring_id
<< ", nranks is " << nranks;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllGather(send_buff,
recv_buff,
send_numel,
dtype,
comm->comm(),
reinterpret_cast<void *>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -34,10 +34,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -34,10 +34,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -24,9 +24,8 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/api/include/tensor.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) || \
defined(PADDLE_WITH_CNCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
......@@ -44,17 +43,10 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#endif
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
DECLARE_bool(hccl_check_nan);
#endif
namespace paddle {
namespace operators {
......@@ -150,177 +142,12 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
template <typename T, typename DeviceContext> \
class op_name##CPUKernel : public CAllReduceOpCPUKernel<red_type, T> {};
#if defined(PADDLE_WITH_ASCEND_CL)
// return true if found_nan or return false;
inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
aclrtStream stream,
const phi::DenseTensor* in) {
phi::DenseTensor out(in->type());
phi::DenseTensor mean(in->type());
mean.Resize({1});
mean.mutable_data<float>(dev_ctx.GetPlace());
std::vector<int> axes;
for (int i = 0; i < in->dims().size(); ++i) {
axes.push_back(i);
}
std::vector<float> vec;
try {
const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
paddle::framework::TensorToVector(mean, dev_ctx, &vec);
} catch (...) {
LOG(WARNING) << "ContainsNan catch exception";
return true;
}
VLOG(4) << "reducemeand result:" << vec[0];
if (std::isnan(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects nan";
return true;
}
if (std::isinf(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects inf";
}
return false;
}
#endif
template <ReduceType red_type, typename T>
class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
if (ctx.HasInput("Cond")) {
auto cond = ctx.Input<phi::DenseTensor>("Cond");
auto place = cond->place();
PADDLE_ENFORCE_EQ(platform::is_cpu_place(place),
true,
platform::errors::PreconditionNotMet(
"The input `cond` tensor should be on cpu place"));
PADDLE_ENFORCE_EQ(cond->numel(),
1,
platform::errors::PreconditionNotMet(
"The input `cond` should be shape [1]"));
if (!cond->data<bool>()[0]) {
VLOG(4) << "Skip all reduce Op since cond is 0";
return;
}
}
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
out->mutable_data<T>(in->dims(), ctx.GetPlace());
void* recvbuff = reinterpret_cast<void*>(out->data<T>());
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
if (ctx.Attr<bool>("use_calc_stream")) {
stream = dev_ctx->stream();
} else {
stream = comm->stream();
}
HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
switch (red_type) {
case kRedSum:
hccl_red_type = HCCL_REDUCE_SUM;
break;
case kRedMax:
hccl_red_type = HCCL_REDUCE_MAX;
break;
case kRedMin:
hccl_red_type = HCCL_REDUCE_MIN;
break;
case kRedProd:
hccl_red_type = HCCL_REDUCE_PROD;
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid reduce type: %d", red_type));
}
VLOG(3) << "hccl allreduce, parameter is: "
<< "input num: " << in->dims() << "dtype: " << dtype
<< "hccl_red_type: " << hccl_red_type << ", group is: " << group
<< ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
<< ", out_size:" << out->memory_size()
<< ", use_calc_stream:" << ctx.Attr<bool>("use_calc_stream")
<< ", stream:" << stream;
phi::DenseTensor tmp;
tmp.mutable_data<float>({8}, ctx.GetPlace());
bool found_nan = false;
auto d_type = framework::TransToProtoVarType(in->dtype());
switch (d_type) {
case framework::proto::VarType::FP16: {
break;
}
case framework::proto::VarType::FP32: {
if (FLAGS_hccl_check_nan) {
VLOG(3) << "prepare to FoundNanInf";
// NOTE: performance relating, DO NOT REMOVE!
ContainsNan(*dev_ctx, dev_ctx->stream(), in);
}
break;
}
default:
break;
}
if (found_nan) {
T inf = static_cast<T>(std::numeric_limits<float>::infinity());
VLOG(4) << "fill input data constant inf";
auto dims = in->dims();
auto mutable_in = const_cast<phi::DenseTensor*>(in);
FillNpuTensorWithConstant<T>(mutable_in, inf);
mutable_in->Resize(dims);
}
VLOG(3) << "hccl allreduce, parameter is: "
<< "input num: " << numel << "dtype: " << dtype
<< "hccl_red_type: " << hccl_red_type << ", group is: " << group
<< ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
<< ", out_size:" << out->memory_size();
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllReduce(sendbuff,
recvbuff,
numel,
dtype,
hccl_red_type,
comm->comm(),
reinterpret_cast<void*>(stream)));
out->Resize(in->dims());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......@@ -616,10 +443,6 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allreduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
......
......@@ -42,10 +42,7 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(0);
AddAttr<int>("root", "(int default 0) root id for broadcasting.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -14,10 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -25,58 +21,8 @@ template <typename T>
class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<phi::DenseTensor>("X");
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
int numel = x->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));
auto out = ctx.Output<phi::DenseTensor>("Out");
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int root = ctx.Attr<int>("root");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
VLOG(3) << "begin hccl broadcast, parameter is: "
<< "root " << root << ", group is " << group
<< ", comm: " << comm->comm() << ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received "
<< phi::product(out->dims());
dev_ctx->Wait();
if (out != x) {
framework::TensorCopy(*static_cast<const phi::DenseTensor*>(x),
place,
*platform::DeviceContextPool::Instance().Get(place),
static_cast<phi::DenseTensor*>(out));
}
dev_ctx->Wait();
out->Resize(x->dims());
out->set_lod(x->lod());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -21,11 +21,6 @@ namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
#if defined(PADDLE_WITH_ASCEND_CL)
#include "hccl/hccl.h"
#include "hccl/hccl_types.h"
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -48,52 +43,9 @@ class CCommInitOpAscend : public framework::OperatorBase {
auto var = scope.FindVar(Input("X"));
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::InvalidArgument("Input cannot be empty."));
#if defined(PADDLE_WITH_ASCEND_CL)
HcclRootInfo* hccl_id = var->GetMutable<HcclRootInfo>();
int rank_ids = Attr<int>("rank_ids");
int rank_id = Attr<int>("rank");
int rid = Attr<int>("ring_id");
int device_id = place.device;
if (Attr<int>("device_id") >= 0) {
device_id = Attr<int>("device_id");
}
platform::HCCLCommContext::Instance().CreateHCCLComm(
hccl_id, rank_ids, rank_id, device_id, rid);
// Build comm
float* buff;
int32_t size = 20;
std::vector<float> input(size, 0);
for (int32_t idx = 0; idx < size; idx++) {
input[idx] = 1.0;
}
PADDLE_ENFORCE_NPU_SUCCESS(platform::RecordedNPUMalloc(
reinterpret_cast<void**>(&buff), size * sizeof(float), device_id));
platform::NPUMemcpySync(reinterpret_cast<void*>(buff),
input.data(),
size * sizeof(float),
ACL_MEMCPY_HOST_TO_DEVICE,
size * sizeof(float));
VLOG(3) << "Build buff data successful.";
aclrtStream stream = nullptr;
auto comm = paddle::platform::HCCLCommContext::Instance().Get(rid, place);
if (rank_id == 0) {
stream = comm->stream();
} else {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
}
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
buff, size, HCCL_DATA_TYPE_FP32, 0, comm->comm(), stream));
// Synchronize stream to find hccl error in time.
platform::NPUStreamSync(stream);
VLOG(3) << "Build connection successful.";
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -27,83 +27,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
#ifdef PADDLE_WITH_ASCEND_CL
static void GenHCCLID(std::vector<HcclRootInfo>* hccl_ids) {
constexpr int timeout = 2 * 60 + 10; // 2MSL+10s
constexpr int retry_time = 1;
for (size_t i = 0; i < hccl_ids->size(); ++i) {
bool failed = true;
for (auto retry_times = 0; retry_times * retry_time < timeout;
++retry_times) {
auto err = platform::dynload::HcclGetRootInfo(&(*hccl_ids)[i]);
if (err == 0) {
failed = false;
break;
}
std::this_thread::sleep_for(std::chrono::seconds(retry_time));
LOG(WARNING) << "HcclGetRootInfo failed, err is: " << err << ", retry "
<< retry_times << " times";
}
if (failed) {
PADDLE_THROW(platform::errors::External("HcclGetRootInfo failed!"));
}
}
}
static void CopyHCCLIDToVar(const std::vector<HcclRootInfo>& hccl_ids,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
for (size_t i = 0; i < hccl_ids.size(); ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto hccl_id = var->GetMutable<HcclRootInfo>();
memcpy(hccl_id, &hccl_ids[i], sizeof(HcclRootInfo));
}
}
class CGenHCCLIdOp : public framework::OperatorBase {
public:
CGenHCCLIdOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
int rank = Attr<int>("rank");
int ring_id = Attr<int>("ring_id");
std::function<std::string(size_t)> func = [&](size_t i) -> std::string {
return Output("Out");
};
std::string endpoint = Attr<std::string>("endpoint");
int server_fd = platform::SocketServer::GetInstance(endpoint).socket();
std::vector<HcclRootInfo> hccl_ids;
hccl_ids.resize(1);
if (rank == 0) {
GenHCCLID(&hccl_ids);
std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("other_endpoints");
platform::SendBroadCastCommID(endpoint_list, &hccl_ids, ring_id);
} else {
platform::RecvBroadCastCommID(server_fd, endpoint, &hccl_ids, ring_id);
}
CopyHCCLIDToVar(hccl_ids, func, scope);
}
};
#else
class CGenHCCLIdOp : public framework::OperatorBase {
public:
CGenHCCLIdOp(const std::string& type,
......@@ -116,8 +39,6 @@ class CGenHCCLIdOp : public framework::OperatorBase {
const platform::Place& dev_place) const override {}
};
#endif
class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......
......@@ -26,7 +26,7 @@ limitations under the License. */
#include "paddle/phi/core/ddim.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
......@@ -44,9 +44,6 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#endif
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
......@@ -134,86 +131,8 @@ template <ReduceType red_type, typename T>
class CReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void* recvbuff = reinterpret_cast<void*>(out->data<T>());
int ring_id = ctx.Attr<int>("ring_id");
int root_id = ctx.Attr<int>("root_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int rank_id = comm->rank();
HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
switch (red_type) {
case kRedSum:
hccl_red_type = HCCL_REDUCE_SUM;
break;
case kRedMax:
hccl_red_type = HCCL_REDUCE_MAX;
break;
case kRedMin:
hccl_red_type = HCCL_REDUCE_MIN;
break;
case kRedProd:
hccl_red_type = HCCL_REDUCE_PROD;
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid reduce type: %d", red_type));
}
VLOG(3) << "begin hccl reduce, parameter is: "
<< "input num: " << numel << "root_id: " << root_id
<< "dtype: " << dtype << "hccl_red_type: " << hccl_red_type
<< ", group is: " << group;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllReduce(sendbuff,
recvbuff,
numel,
dtype,
hccl_red_type,
comm->comm(),
reinterpret_cast<void*>(stream)));
if (rank_id != root_id) {
auto npu_place = place;
memory::Copy(npu_place,
reinterpret_cast<void*>(out->data<T>()),
npu_place,
reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
numel * sizeof(T),
stream);
}
out->Resize(in->dims());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......@@ -433,10 +352,7 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the reduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for reduce.")
.SetDefault("tag");
#endif
AddAttr<int>("root_id", "(int default 0) root id.").SetDefault(0);
AddAttr<bool>(
"use_calc_stream",
......
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -50,10 +50,7 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("nranks",
"Total trainer count of the distributed training job")
.SetDefault(1);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for reduce scatter.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -14,10 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -25,59 +21,8 @@ template <typename T>
class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
auto out_dims = in->dims();
PADDLE_ENFORCE_EQ(out_dims[0] % nranks,
0,
platform::errors::InvalidArgument(
"The input tensor X's "
"dim[0] (%d) should be divisible by nranks(%d)",
out_dims[0],
nranks));
out_dims[0] = out_dims[0] / nranks;
out->mutable_data<T>(out_dims, place);
uint64_t recv_numel = in->numel() / nranks;
void* inputPtr = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void* outputPtr = reinterpret_cast<void*>(out->data<T>());
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
VLOG(3) << "begin hccl reduce scatter, parameter is: "
<< "recv_numel: " << recv_numel << "dtype: " << dtype
<< "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclReduceScatter(inputPtr,
outputPtr,
recv_numel,
dtype,
HCCL_REDUCE_SUM,
comm->comm(),
reinterpret_cast<void*>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -34,10 +34,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -47,17 +47,6 @@ class CSyncCalcStreamKernel : public framework::OpKernel<T> {
platform::GpuStreamSync(dev_ctx->stream());
#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_npu_place(place),
true,
platform::errors::PreconditionNotMet(
"Sync stream op can run on npu place only for now."));
auto dev_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
platform::NPUStreamSync(dev_ctx->stream());
#elif defined(PADDLE_WITH_CNCL)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_mlu_place(place),
......
......@@ -26,7 +26,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#if defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
......@@ -45,19 +45,6 @@ class CSyncCommStreamKernel : public framework::OpKernel<T> {
platform::GpuStreamSync(stream);
#elif defined(PADDLE_WITH_ASCEND_CL)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_npu_place(place),
true,
platform::errors::PreconditionNotMet(
"Sync comm stream op can run on npu place only for "
"now, but we got %s, please check the environment.",
place.DebugString()));
int ring_id = ctx.Attr<int>("ring_id");
auto stream =
platform::HCCLCommContext::Instance().Get(ring_id, place)->stream();
platform::NPUStreamSync(stream);
#elif defined(PADDLE_WITH_CNCL)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_mlu_place(place),
......
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -32,10 +32,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -30,144 +30,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
#ifdef PADDLE_WITH_ASCEND_CL
class GenHCCLIdOp : public framework::OperatorBase {
public:
GenHCCLIdOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
std::vector<std::string> trainers =
Attr<std::vector<std::string>>("trainers");
int trainer_id = Attr<int>("trainer_id");
std::string endpoint = trainers[trainer_id];
PADDLE_ENFORCE_GE(
trainer_id,
0,
platform::errors::InvalidArgument("trainer_id %d is less than 0. Its "
"valid range is [0, trainer_size)"));
PADDLE_ENFORCE_LT(
trainer_id,
static_cast<int>(trainers.size()),
platform::errors::OutOfRange("trainer_id %d is out of range. Its valid "
"range is [0, trainer_size)",
trainer_id));
int hccl_comm_num = Attr<int>("hccl_comm_num");
int use_hierarchical_allreduce = Attr<bool>("use_hierarchical_allreduce");
int inter_nranks = Attr<int>("hierarchical_allreduce_inter_nranks");
int inter_trainer_id = -1;
int exter_trainer_id = -1;
if (use_hierarchical_allreduce) {
PADDLE_ENFORCE_GT(
trainers.size(),
1,
platform::errors::PreconditionNotMet(
"The number of collective trainers %llu <= 1", trainers.size()));
PADDLE_ENFORCE_GT(
inter_nranks,
1,
platform::errors::PreconditionNotMet(
"inter_nranks %d <= 1 while in hierarchical allreduce mode",
inter_nranks));
PADDLE_ENFORCE_EQ(
trainers.size() % inter_nranks,
0,
platform::errors::PreconditionNotMet(
"The number of trainers %llu mod inter_nranks %d is not equal 0",
trainers.size(),
inter_nranks));
inter_trainer_id = trainer_id % inter_nranks;
if (trainer_id % inter_nranks == 0) {
exter_trainer_id = trainer_id / inter_nranks;
}
}
std::ostringstream ss;
for (size_t i = 0; i < trainers.size(); i++) {
ss << trainers[i] << ",";
}
VLOG(1) << "trainer_id:" << trainer_id
<< ", use_hierarchical_allreduce:" << use_hierarchical_allreduce
<< ", hccl_comm_num:" << hccl_comm_num
<< ", inter_nranks:" << inter_nranks
<< ", inter_trainer_id:" << inter_trainer_id
<< ", exter_trainer_id:" << exter_trainer_id
<< ", trainers:" << ss.str();
int server_fd = -1;
/// 1. init flat
std::function<std::string(size_t)> func = platform::GetFlatHCCLVarName;
if (trainer_id == 0) {
// server endpoints
std::vector<std::string> flat_endpoints;
flat_endpoints.insert(
flat_endpoints.begin(), trainers.begin() + 1, trainers.end());
SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope);
} else {
server_fd = CreateListenSocket(endpoint);
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
/// 2. hierarchical inter ncclid
func = platform::GetHierarchicalInterHCCLVarName;
if (inter_trainer_id == 0) {
std::ostringstream ss;
ss << endpoint;
std::vector<std::string> inter_endpoints;
for (int i = trainer_id + 1; i < trainer_id + inter_nranks &&
i < static_cast<int>(trainers.size());
i++) {
ss << ",";
inter_endpoints.push_back(trainers[i]);
ss << trainers[i];
}
VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str();
SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope);
} else if (inter_trainer_id > 0) {
VLOG(1) << "Hierarchical inter ring";
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
/// 3. hierarchical exter ncclid
func = platform::GetHierarchicalExterHCCLVarName;
if (exter_trainer_id == 0) {
std::ostringstream ss;
std::vector<std::string> exter_endpoints;
ss << endpoint;
for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) {
ss << ",";
exter_endpoints.push_back(trainers[i]);
ss << trainers[i];
}
VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str();
SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope);
} else if (exter_trainer_id > 0) {
VLOG(1) << "Hierarchical exter ring";
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
// close socket server
if (trainer_id != 0) {
CloseSocket(server_fd);
}
}
};
#else
class GenHCCLIdOp : public framework::OperatorBase {
public:
GenHCCLIdOp(const std::string& type,
......@@ -180,8 +42,6 @@ class GenHCCLIdOp : public framework::OperatorBase {
const platform::Place& dev_place) const override {}
};
#endif
class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......
......@@ -30,10 +30,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/split.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
DECLARE_int32(get_host_by_name_time);
namespace paddle {
......
......@@ -42,10 +42,7 @@ class MpAllReduceSumOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allreduced result in model parallel.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -50,10 +50,7 @@ class PartialAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allgather result");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all gather.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -24,67 +24,8 @@ template <typename T>
class CallPartialGatherOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int64_t numel = in->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int rank = ctx.Attr<int>("rank");
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
PADDLE_ENFORCE_EQ(rank,
comm->rank(),
platform::errors::InvalidArgument(
"rank: %s should equal to %s", rank, comm->rank()));
PADDLE_ENFORCE_EQ(
(numel % nranks),
0,
platform::errors::InvalidArgument(
"The input numel (%d) must be divisible by nranks(%d)",
numel,
nranks));
framework::DDim dims = in->dims();
out->mutable_data<T>(dims, place);
int64_t send_numel = numel / nranks;
int offset = send_numel * rank;
void *send_buff =
reinterpret_cast<void *>(const_cast<T *>(in->data<T>()) + offset);
void *recv_buff = reinterpret_cast<void *>(out->data<T>());
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext *>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
VLOG(3) << "begin hccl allgather, parameter is: "
<< ", group is " << group << ", ring_id is " << ring_id
<< ", nranks is " << nranks << ", rankid is " << rank;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllGather(send_buff,
recv_buff,
send_numel,
dtype,
comm->comm(),
reinterpret_cast<void *>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -98,12 +98,7 @@ class PartialRecvOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
.SetDefault(5);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
.SetDefault(std::vector<int>());
AddAttr<bool>(
......
......@@ -22,57 +22,8 @@ template <typename T>
class PartialRecvOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(out->dims(), ctx.GetPlace());
int num = ctx.Attr<int>("num");
int id = ctx.Attr<int>("id");
int recv_numel = out->numel() / num;
int offset = recv_numel * id;
void* ptr =
reinterpret_cast<void*>(const_cast<T*>(out->data<T>()) + offset);
int numel = recv_numel;
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(out->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int peer = ctx.Attr<int>("peer");
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = peer;
VLOG(3) << "begin hccl recv, parameter is: "
<< "ring_id:" << ring_id << ", nranks:" << nranks
<< ", peer:" << peer << ", numel:" << numel << ", ptr:" << ptr
<< ", dtype:" << dtype << ", root:" << root
<< ", comm: " << comm->comm() << ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -65,12 +65,7 @@ class PartialSendMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0);
AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -22,52 +22,8 @@ template <typename T>
class PartialSendOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<phi::DenseTensor>("X");
int num = ctx.Attr<int>("num");
int id = ctx.Attr<int>("id");
int send_numel = x->numel() / num;
int offset = send_numel * id;
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()) + offset);
int numel = send_numel;
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int rank = comm->rank();
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = rank;
VLOG(3) << "begin hccl send, parameter is: "
<< "root " << root << ", comm: " << comm->comm()
<< ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -87,12 +87,7 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
.SetDefault(5);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
.SetDefault(std::vector<int>());
AddAttr<bool>(
......
......@@ -14,9 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/recv_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h"
......@@ -27,59 +24,8 @@ template <typename T>
class CRecvOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(out->dims(), ctx.GetPlace());
void* ptr = reinterpret_cast<void*>(const_cast<T*>(out->data<T>()));
int numel = out->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(out->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto map = distributed::ProcessGroupMapFromGid::getInstance();
if (map->has(ring_id)) {
// Use ProcessGroup
distributed::ProcessGroup* pg = map->get(ring_id);
std::vector<phi::DenseTensor> out_tensor;
out_tensor.emplace_back(*out);
auto task = pg->Recv(out_tensor, 0);
return;
}
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int peer = ctx.Attr<int>("peer");
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = peer;
VLOG(3) << "begin hccl recv, parameter is: "
<< "ring_id:" << ring_id << ", nranks:" << nranks
<< ", peer:" << peer << ", numel:" << numel << ", ptr:" << ptr
<< ", dtype:" << dtype << ", root:" << root
<< ", comm: " << comm->comm() << ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -61,12 +61,7 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0);
AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -14,9 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/send_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h"
......@@ -27,56 +24,8 @@ template <typename T>
class CSendOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<phi::DenseTensor>("X");
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
int numel = x->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto map = distributed::ProcessGroupMapFromGid::getInstance();
if (map->has(ring_id)) {
// Use ProcessGroup
distributed::ProcessGroup* pg = map->get(ring_id);
std::vector<phi::DenseTensor> in_tensor;
in_tensor.push_back(*x);
auto task = pg->Send(in_tensor, 1);
return;
}
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int rank = comm->rank();
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = rank;
VLOG(3) << "begin hccl send, parameter is: "
<< "root " << root << ", comm: " << comm->comm()
<< ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
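The deleted send kernel above mirrors the recv kernel at the top of this diff: it prefers a ProcessGroup registered for the ring id and otherwise falls back to the legacy HCCLCommContext communicator, emulating a point-to-point transfer with a two-rank HcclBroadcast rooted at the sender. The condensed sketch below only restates that control flow for reference; all identifiers are taken from the removed code, the helper name SendOnRing is hypothetical, and none of this builds once the PADDLE_WITH_ASCEND_CL path is gone.

// Illustrative, condensed sketch of the removed send path; identifiers mirror
// the deleted kernel above, SendOnRing is a hypothetical helper name.
template <typename T>
void SendOnRing(const framework::ExecutionContext& ctx,
                const phi::DenseTensor& x, int ring_id) {
  auto map = distributed::ProcessGroupMapFromGid::getInstance();
  if (map->has(ring_id)) {
    // Preferred path: a registered ProcessGroup owns stream and peer handling.
    std::vector<phi::DenseTensor> in_tensor{x};
    map->get(ring_id)->Send(in_tensor, /*dst_rank=*/1);
    return;
  }
  // Legacy path: per-ring HCCL communicator plus explicit stream selection.
  auto place = ctx.GetPlace();
  auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
  auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
  aclrtStream stream =
      ctx.Attr<bool>("use_calc_stream")
          ? static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream()
          : comm->stream();
  // Point-to-point send emulated as a two-rank broadcast rooted at the sender.
  void* ptr = reinterpret_cast<void*>(const_cast<T*>(x.data<T>()));
  HcclDataType dtype =
      platform::ToHCCLDataType(framework::TransToProtoVarType(x.dtype()));
  PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
      ptr, x.numel(), dtype, static_cast<uint32_t>(comm->rank()),
      comm->comm(), stream));
}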
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -84,12 +84,6 @@ class ConditionalOp : public framework::OperatorBase {
res = cpu_tensor.data<bool>()[0];
#endif
} else if (platform::is_npu_place(ips[0]->place())) {
#ifdef PADDLE_WITH_ASCEND_CL
phi::DenseTensor cpu_tensor;
framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
res = cpu_tensor.data<bool>()[0];
#endif
} else if (platform::is_xpu_place(ips[0]->place())) {
#ifdef PADDLE_WITH_XPU
phi::DenseTensor cpu_tensor;
......
......@@ -228,9 +228,8 @@ bool GetCondData(const phi::DenseTensor &cond) {
// platform::is_npu_place(cond.place()) or
// platform::is_xpu_place(cond.place()) is true
std::unique_ptr<phi::DenseTensor> cpu_cond{new phi::DenseTensor()};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU) || \
defined(PADDLE_WITH_CUSTOM_DEVICE)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
......
......@@ -16,8 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
......@@ -182,84 +181,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
template <typename T>
class ConcatFunctor<platform::NPUDeviceContext, T> {
public:
void operator()(const platform::NPUDeviceContext& context,
const std::vector<phi::DenseTensor>& input,
int axis,
phi::DenseTensor* output) {
int dev_id = context.GetPlace().GetDeviceId();
platform::NPUDeviceGuard guard(dev_id);
std::vector<std::string> names;
for (size_t i = 0; i < input.size(); ++i) {
names.push_back("x" + std::to_string(i));
}
NpuOpRunner runner{
"ConcatD",
{input},
{*output},
{{"concat_dim", axis}, {"N", static_cast<int>(input.size())}}};
runner.AddInputNames(names);
runner.Run(context.stream());
}
};
template <typename T>
class SplitFunctor<platform::NPUDeviceContext, T> {
public:
void operator()(const platform::NPUDeviceContext& context,
const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& ref_inputs,
const int axis,
std::vector<phi::DenseTensor*>* outputs) {
if (input.numel() == 0) {
return;
}
size_t num = outputs->size();
int input_rows = 1;
auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i];
}
int input_cols = 0;
std::vector<int64_t> output_cols(outputs->size());
for (size_t i = 0; i < num; ++i) {
int t_cols = ref_inputs[i]->numel() / input_rows;
input_cols += t_cols;
output_cols[i] = t_cols;
}
auto npu_place = context.GetPlace();
// computation
for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0;
for (size_t j = 0; j < num; ++j) {
int col_len = output_cols[j];
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len;
memory::Copy(npu_place,
dst_ptr,
npu_place,
src_ptr + col_idx,
sizeof(T) * col_len,
context.stream());
}
col_idx += col_len;
}
}
}
};
#endif
#ifdef PADDLE_WITH_MLU
template <typename T>
class ConcatFunctor<platform::MLUDeviceContext, T> {
......@@ -369,14 +290,6 @@ DEFINE_XPU_FUNCTOR(float)
DEFINE_XPU_FUNCTOR(platform::float16)
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#define DEFINE_NPU_FUNCTOR(type) \
template class ConcatFunctor<platform::NPUDeviceContext, type>; \
template class SplitFunctor<platform::NPUDeviceContext, type>;
FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR)
#endif
#ifdef PADDLE_WITH_MLU
#define DEFINE_MLU_FUNCTOR(type) \
template class ConcatFunctor<platform::MLUDeviceContext, type>; \
......
......@@ -123,34 +123,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
MemcpyH2DInferShapeFunctor);
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_h2d,
float,
ops::MemcpyH2DKernel,
double,
ops::MemcpyH2DKernel,
int8_t,
ops::MemcpyH2DKernel,
uint8_t,
ops::MemcpyH2DKernel,
int,
ops::MemcpyH2DKernel,
int64_t,
ops::MemcpyH2DKernel,
bool,
ops::MemcpyH2DKernel,
paddle::platform::bfloat16,
ops::MemcpyH2DKernel,
paddle::platform::complex<float>,
ops::MemcpyH2DKernel,
paddle::platform::complex<double>,
ops::MemcpyH2DKernel,
plat::float16,
ops::MemcpyH2DKernel,
int16_t,
ops::MemcpyH2DKernel);
#endif
#ifdef PADDLE_WITH_IPU
REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_h2d,
float,
......
......@@ -145,19 +145,3 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
MemcpyInferShapeFunctor);
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy,
float,
ops::MemcpyKernel,
double,
ops::MemcpyKernel,
int,
ops::MemcpyKernel,
int64_t,
ops::MemcpyKernel,
bool,
ops::MemcpyKernel,
plat::float16,
ops::MemcpyKernel);
#endif
......@@ -61,14 +61,7 @@ class MemcpyFunctor {
lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor);
} else if (dst_place_type_ == DeviceType::CPU) {
framework::TensorCopySync(lod_tensor, platform::CPUPlace(), &out_tensor);
#ifdef PADDLE_WITH_ASCEND_CL
} else if (dst_place_type_ == DeviceType::NPU) { /* npu_pin->npu */
framework::TensorCopy(
lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor);
} else if (dst_place_type_ == DeviceType::NPU_PINNED) { /* npu->npu_pin */
framework::TensorCopy(
lod_tensor, platform::NPUPinnedPlace(), dev_ctx_, &out_tensor);
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (dst_place_type_ == DeviceType::CUSTOM_DEVICE) {
framework::TensorCopy(
......
......@@ -63,21 +63,6 @@ BufferedReader::BufferedReader(
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place_)) {
int dev_idx = place_.device;
compute_stream_ =
((platform::NPUDeviceContext *)(platform::DeviceContextPool::Instance()
.Get(place_)))
->stream();
events_.resize(buffer_size);
for (auto &event : events_) {
event = platform::NpuEventResourcePool::Instance().New(dev_idx);
}
stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx);
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place_)) {
int dev_idx = place_.device;
......@@ -275,56 +260,6 @@ void BufferedReader::ReadAsync(size_t i) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place_)) {
TensorVec &npu = npu_buffer_[i];
if (npu.empty()) {
npu.resize(cpu.size());
} else {
PADDLE_ENFORCE_EQ(
npu.size(),
cpu.size(),
platform::errors::InvalidArgument(
"Input tensor number on NPU and CPU devices are not matched. "
"The number on NPU is %d, on CPU is %d",
npu.size(),
cpu.size()));
}
std::vector<void *> npu_ptrs;
npu_ptrs.reserve(cpu.size());
for (size_t i = 0; i < cpu.size(); ++i) {
npu[i].Resize(cpu[i].dims());
npu[i].set_layout(cpu[i].layout());
npu_ptrs.emplace_back(npu[i].mutable_data(place_, cpu[i].type()));
}
platform::SetNPUDeviceId(place_.device);
platform::NPUEventRecord(events_[i].get(), compute_stream_);
platform::NPUStreamWaitEvent(stream_.get(), events_[i].get());
platform::RecordEvent record_event("BufferedReader:MemoryCopy",
platform::TracerEventType::UserDefined,
1);
for (size_t i = 0; i < cpu.size(); ++i) {
auto cpu_place = cpu[i].place();
auto cpu_ptr = cpu[i].data();
auto npu_ptr = npu_ptrs[i];
auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype());
if ((platform::is_npu_place(cpu_place))) {
memory::Copy(
place_, npu_ptr, cpu_place, cpu_ptr, size, stream_.get());
} else {
memory::Copy(
place_, npu_ptr, cpu_place, cpu_ptr, size, stream_.get());
platform::NPUStreamSync(stream_.get());
}
npu[i].set_lod(cpu[i].lod());
}
platform::NPUStreamSync(stream_.get());
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place_)) {
TensorVec &mlu = mlu_buffer_[i];
......
......@@ -25,8 +25,7 @@
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"
......@@ -93,12 +92,6 @@ class BufferedReader : public framework::DecoratedReader {
std::vector<std::shared_ptr<platform::CudaEventObject>> events_;
#endif
#ifdef PADDLE_WITH_ASCEND_CL
aclrtStream compute_stream_;
std::shared_ptr<platform::NpuStreamObject> stream_;
std::vector<std::shared_ptr<platform::NpuEventObject>> events_;
#endif
#ifdef PADDLE_WITH_MLU
mluStream compute_stream_;
std::shared_ptr<platform::MluStreamObject> stream_;
......
......@@ -11,19 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/operators/run_program_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
/* see [Why use single type kernel] */
REGISTER_OP_NPU_KERNEL(
run_program,
ops::RunProgramOpKernel<paddle::platform::NPUDeviceContext, float>);
REGISTER_OP_NPU_KERNEL(
run_program_grad,
ops::RunProgramGradOpKernel<paddle::platform::NPUDeviceContext, float>);
#endif
......@@ -11,107 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ScatterNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* index = ctx.Input<phi::DenseTensor>("Ids");
auto* updates = ctx.Input<phi::DenseTensor>("Updates");
bool overwrite = ctx.Attr<bool>("overwrite");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
phi::DenseTensor tmp_tensor(index->type());
const auto index_dims = index->dims();
if (index_dims.size() == 1) {
tmp_tensor.ShareDataWith(*index);
std::vector<int64_t> new_dim = {index_dims[0], 1};
tmp_tensor.Resize(phi::make_ddim(new_dim));
index = &tmp_tensor;
}
const auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto op_func_update = [](const std::vector<phi::DenseTensor>& inputs,
const std::vector<phi::DenseTensor>& outputs,
const NPUAttributeMap& attrs,
const platform::NPUDeviceContext& dev_ctx) {
const auto& runner =
NpuOpRunner("TensorScatterUpdate", inputs, outputs, attrs);
runner.Run(dev_ctx.stream());
};
auto op_func_add = [](const std::vector<phi::DenseTensor>& inputs,
const std::vector<phi::DenseTensor>& outputs,
const NPUAttributeMap& attrs,
const platform::NPUDeviceContext& dev_ctx) {
const auto& runner =
NpuOpRunner("TensorScatterAdd", inputs, outputs, attrs);
runner.Run(dev_ctx.stream());
};
if (overwrite) {
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
NpuOpRunner::TypeAdapter({*x, *index, *updates},
{*out},
{},
dev_ctx,
op_func_update,
{framework::proto::VarType::INT32,
framework::proto::VarType::INT32,
framework::proto::VarType::INT32},
{framework::proto::VarType::INT32});
} else {
const auto& runner_update = NpuOpRunner(
"TensorScatterUpdate", {*x, *index, *updates}, {*out}, {});
runner_update.Run(dev_ctx.stream());
}
} else {
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
NpuOpRunner::TypeAdapter({*x, *index, *updates},
{*out},
{},
dev_ctx,
op_func_add,
{framework::proto::VarType::INT32,
framework::proto::VarType::INT32,
framework::proto::VarType::INT32},
{framework::proto::VarType::INT32});
} else {
const auto& runner_add =
NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {});
runner_add.Run(dev_ctx.stream());
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
scatter,
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
#endif
......@@ -41,7 +41,7 @@ class SoftmaxWithCrossEntropyOpMaker
"The outputs value of softmax activation by given the input batch, "
"which will be used in backward calculation.")
.AsIntermediate();
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
AddOutput(
"Backprop",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
......@@ -135,7 +135,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
true,
platform::errors::InvalidArgument(
"Output(Softmax) should be not null."));
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"),
true,
platform::errors::InvalidArgument(
......@@ -206,10 +206,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
}
ctx->SetOutputDim("Softmax", logits_dims);
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
ctx->SetOutputDim("Backprop", logits_dims);
ctx->ShareLoD("Logits", /*->*/ "Backprop");
#endif
logits_dims[axis] = 1;
ctx->SetOutputDim("Loss", logits_dims);
......@@ -238,7 +235,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
true,
platform::errors::InvalidArgument(
"Input(Softmax) should be not null."));
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"),
true,
platform::errors::InvalidArgument(
......@@ -327,7 +324,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker<T> {
grad_op->SetType("softmax_with_cross_entropy_grad");
grad_op->SetInput("Label", this->Input("Label"));
grad_op->SetInput("Softmax", this->Output("Softmax"));
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
grad_op->SetInput("Backprop", this->Output("Backprop"));
#endif
grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
......@@ -359,7 +356,7 @@ REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradInplaceInferer);
REGISTER_OP_VERSION(softmax_with_cross_entropy)
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
.AddCheckpoint(
R"ROC(
Add a new attribute [use_softmax] )ROC",
......
......@@ -127,11 +127,6 @@ void TensorFormatter::FormatData(const phi::DenseTensor& print_tensor,
} else {
platform::CPUPlace cpu_place;
paddle::framework::TensorCopy(print_tensor, cpu_place, &cpu_tensor);
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(print_tensor.place())) {
platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait();
}
#endif
data = cpu_tensor.data<T>();
}
......
......@@ -11,50 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include "paddle/fluid/operators/unsqueeze_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
unsqueeze,
ops::UnsqueezeKernel<plat::NPUDeviceContext, float>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, double>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, plat::float16>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, bool>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
unsqueeze2,
ops::UnsqueezeKernel<plat::NPUDeviceContext, float>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, double>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, plat::float16>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, bool>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
unsqueeze_grad,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, float>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, double>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, plat::float16>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, bool>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int8_t>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
unsqueeze2_grad,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, float>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, double>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, plat::float16>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, bool>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int64_t>);
#endif
......@@ -92,7 +92,7 @@ inline T GetValue(const phi::DenseTensor* x) {
if (!platform::is_cpu_place(x->place())) {
phi::DenseTensor cpu_x;
framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx = pool.Get(x->place());
dev_ctx->Wait();
......
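Several hunks in this diff (ConditionalOp, GetCondData, TensorFormatter, and GetValue above) rely on the same idiom for reading a scalar that may live on a non-CPU device: copy the tensor to CPUPlace, wait on the source place's device context so a possibly asynchronous copy has completed, then read the host copy. A minimal sketch of that idiom follows; ReadScalarFromDevice is a hypothetical name, not an existing Paddle helper.

// Minimal sketch of the "copy to CPU, synchronize, then read" idiom used by
// the hunks above; ReadScalarFromDevice is hypothetical.
template <typename T>
T ReadScalarFromDevice(const phi::DenseTensor& x) {
  if (platform::is_cpu_place(x.place())) {
    return x.data<T>()[0];
  }
  phi::DenseTensor cpu_x;
  framework::TensorCopy(x, platform::CPUPlace(), &cpu_x);
  // TensorCopy can be asynchronous on stream-based devices, so wait on the
  // source place's device context before touching the host copy.
  platform::DeviceContextPool::Instance().Get(x.place())->Wait();
  return cpu_x.data<T>()[0];
}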
......@@ -147,118 +147,6 @@ class NCCLCommContext {
};
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
// In order to apply hierarchical communication with HCCL, we need
// a communication ring that contains HCCL communicators associated with a
// global HCCLUniqueId. E.g. for a hierarchical case,
//
// 11 - 12 21 - 22
// | | | |
// 13 - 14 - 23 - 24
// | |
// 31 - 32 - 41 - 42
// | | | |
// 33 - 34 43 - 44
//
// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
// (31,32,33,34), (41,42,43,44) as bottoms respectively.
//
// We could also use a single communication ring for the flattened case
//
// The HCCLComm instance is created and reserved in the HCCLCommContext
// singleton with a global user-specified group id.
class NPUDeviceContext;
#define ENV_RANK_TABLE_FILE "RANK_TABLE_FILE"
#define ENV_RANK_ID "PADDLE_TRAINER_ID"
class HCCLComm {
public:
virtual int ring_id() const = 0;
virtual int nranks() const = 0;
virtual int rank() const = 0;
virtual int device_id() const = 0;
virtual HcclComm comm() const = 0;
virtual aclrtStream stream() const = 0;
virtual NPUDeviceContext* dev_context() const = 0;
virtual ~HCCLComm() = default;
};
// A singleton HCCL communicator context reserves communication ring ids
class HCCLCommContext {
public:
static HCCLCommContext& Instance() {
static HCCLCommContext comm_ctx;
return comm_ctx;
}
HCCLComm* CreateHCCLComm(
HcclRootInfo* hccl_id, int nranks, int rank, int dev_id, int ring_id);
// a comm created later with the same dev_id and the same ring_id
// will override the former one
HCCLComm* AssignHCCLComm(
HcclComm comm, int nranks, int rank, int dev_id, int ring_id);
// retrieve a communicator by the ring id in multiprocessing mode
HCCLComm* Get(int ring_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id),
0,
platform::errors::InvalidArgument(
"Communicator in ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(),
1,
platform::errors::InvalidArgument(
"One device id should be specified to retrieve from "
"multiple communicators."));
return comm_map_.at(ring_id).begin()->second.get();
}
// retrieve a communicator by the ring id and the device id
HCCLComm* Get(int ring_id, int dev_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id),
0,
platform::errors::InvalidArgument(
"Communicator of ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_GT(
comm_map_.at(ring_id).count(dev_id),
0,
platform::errors::InvalidArgument(
"Communicator at device id %d has not been initialized in ring %d.",
dev_id,
ring_id));
return comm_map_.at(ring_id).at(dev_id).get();
}
// retrieve a communicator by the ring id and place
HCCLComm* Get(int ring_id, Place place) const {
return Get(ring_id, place.device);
}
private:
// Init global hcom
HCCLCommContext() {}
// we may use the group feature in the future
// HCCLCommContext() { InitHcomWorldGroup(); }
HcclComm comm_;
public:
~HCCLCommContext() {}
std::once_flag once_flag_;
std::mutex comm_map_mutex_;
// ring id to dev-HCCLComm
std::map<int, std::map<int, std::unique_ptr<HCCLComm>>> comm_map_;
// void InitHcomWorldGroup();
void ReleaseHCCLComms();
DISABLE_COPY_AND_ASSIGN(HCCLCommContext);
};
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
// In order to apply hierarchical communication with BKCL, we need
// a communication ring that contains BKCL communicators associated with a global
......
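The two Get overloads of the deleted context encode its lookup contract: Get(ring_id) is valid only when exactly one communicator is registered for the ring (the multi-process case), while Get(ring_id, dev_id) or the Place overload disambiguates when several devices of one process share a ring. The small wrapper below is purely illustrative, assumes the removed class were still available, and uses a hypothetical name.

// Illustrative only: PickComm is a hypothetical helper showing the two lookup
// modes documented in the deleted HCCLCommContext declaration above.
platform::HCCLComm* PickComm(int ring_id, int dev_id, bool multi_device) {
  auto& ctx = platform::HCCLCommContext::Instance();
  if (!multi_device) {
    // Multi-process mode: exactly one communicator is registered per ring.
    return ctx.Get(ring_id);
  }
  // Several devices of one process share the ring: the device id disambiguates.
  return ctx.Get(ring_id, dev_id);
}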
......@@ -266,51 +266,5 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
// NOTE(zhiqiu): Usually, no need to create context explicitly,
// ACL creates a default context which contains 1 default stream
// and 1 sync stream after aclrtSetDevice.
platform::GetCurrentNPUContext(&context_);
stream_.reset(new stream::NPUStream(place));
}
NPUDeviceContext::~NPUDeviceContext() {
// NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}
void NPUDeviceContext::Wait() const {
platform::RecordEvent record_event(
"NPUDeviceContext/wait", platform::TracerEventType::UserDefined, 2);
VLOG(4) << "NPU context(" << this << ") Wait";
stream_->Wait();
}
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
const Place& NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext NPUDeviceContext::context() const { return context_; }
NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
}
NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
: place_(place) {
eigen_device_.reset(new Eigen::DefaultDevice());
}
Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif
} // namespace platform
} // namespace paddle
......@@ -34,10 +34,6 @@ limitations under the License. */
#include "xpu/bkcl.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
#if defined(PADDLE_WITH_CNCL)
#include <cncl.h>
#endif
......@@ -334,11 +330,7 @@ static int ConnectAddr(const std::string& ep, const CommHead head) {
}
// TODO(WANGXI): maybe need to unify this hard-coded value
#ifdef PADDLE_WITH_ASCEND_CL
#define MAX_COMMUNIQUEID_LEN 4108
#else
#define MAX_COMMUNIQUEID_LEN 1024
#endif
template <typename CommUniqueId>
static void RecvCommID(int conn, CommUniqueId* nccl_id) {
......@@ -456,9 +448,7 @@ INSTANT_TEMPLATE(ncclUniqueId)
#ifdef PADDLE_WITH_XPU_BKCL
INSTANT_TEMPLATE(BKCLUniqueId)
#endif
#ifdef PADDLE_WITH_ASCEND_CL
INSTANT_TEMPLATE(HcclRootInfo)
#endif
#ifdef PADDLE_WITH_CNCL
INSTANT_TEMPLATE(cnclCliqueId)
#endif
......
......@@ -228,9 +228,7 @@ void InitDevices(const std::vector<int> devices) {
#ifdef PADDLE_WITH_IPU
places.emplace_back(platform::IPUPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_ASCEND_CL
places.emplace_back(platform::NPUPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_MLU
places.emplace_back(platform::MLUPlace(devices[i]));
#endif
......
......@@ -19,8 +19,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
//
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#include "paddle/phi/common/place.h"
namespace paddle {
......@@ -95,24 +93,14 @@ typename Visitor::result_type VisitPlace(const Place &place,
#endif
}
case phi::AllocationType::NPU: {
#ifdef PADDLE_WITH_ASCEND_CL
platform::NPUPlace p(place.GetDeviceId());
return visitor(p);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu_pinned"));
return typename Visitor::result_type();
#endif
}
case phi::AllocationType::NPUPINNED: {
#ifdef PADDLE_WITH_ASCEND_CL
platform::NPUPinnedPlace p;
return visitor(p);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu_pinned"));
return typename Visitor::result_type();
#endif
}
case phi::AllocationType::IPU: {
#ifdef PADDLE_WITH_IPU
......
......@@ -33,11 +33,8 @@ static void StreamCallbackFunc(gpuStream_t stream,
#endif
#endif
#if PADDLE_WITH_ASCEND_CL
static void StreamCallbackFunc(void *user_data)
#endif
#if PADDLE_WITH_MLU
static void StreamCallbackFunc(void *user_data)
static void StreamCallbackFunc(void *user_data)
#endif
{
std::unique_ptr<std::function<void()>> func(
......@@ -75,12 +72,6 @@ void StreamCallbackManager<Stream>::AddCallback(
#endif
#endif
#if PADDLE_WITH_ASCEND_CL
VLOG(3) << "aclrtLaunchCallback at stream: " << stream_;
// TODO(zhiqiu): failed to call aclrtLaunchCallback
NPULaunchCallback(StreamCallbackFunc, func, ACL_CALLBACK_BLOCK, stream_);
#endif
#if PADDLE_WITH_MLU
VLOG(3) << "MLULaunchCallback at stream: " << stream_;
cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func);
......@@ -94,9 +85,6 @@ void StreamCallbackManager<Stream>::Wait() const {
#endif
#ifdef PADDLE_WITH_MLU
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUStreamSync(stream_);
#endif
{
std::lock_guard<std::mutex> lock(mtx_);
......@@ -112,9 +100,7 @@ template struct StreamCallbackManager<gpuStream_t>;
#ifdef PADDLE_WITH_HIP
template struct StreamCallbackManager<hipStream_t>;
#endif
#ifdef PADDLE_WITH_ASCEND_CL
template struct StreamCallbackManager<aclrtStream>;
#endif
#ifdef PADDLE_WITH_MLU
template struct StreamCallbackManager<mluStream>;
#endif
......
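The StreamCallbackFunc trampolines in this file all use the standard C++ technique for passing a std::function through a C-style void* callback: heap-allocate the closure when the callback is enqueued, hand the raw pointer to the runtime as user_data, and re-wrap it in a std::unique_ptr inside the trampoline so it runs and is destroyed exactly once. The framework-free sketch below shows the pattern; Trampoline, EnqueueCallback, and the launch signature are illustrative, not Paddle or vendor APIs.

#include <functional>
#include <memory>

// C-style callback signature expected by a hypothetical stream runtime.
using RawCallback = void (*)(void* user_data);

// Trampoline: take ownership back from the raw pointer and run the closure
// exactly once; the unique_ptr destroys it when the trampoline returns.
static void Trampoline(void* user_data) {
  std::unique_ptr<std::function<void()>> func(
      reinterpret_cast<std::function<void()>*>(user_data));
  (*func)();
}

// Enqueue side: heap-allocate the closure so it outlives this call, then hand
// the trampoline plus the raw pointer to the runtime's launch function.
void EnqueueCallback(void (*launch)(RawCallback, void*),
                     std::function<void()> callback) {
  auto* heap_cb = new std::function<void()>(std::move(callback));
  launch(Trampoline, heap_cb);
}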
......@@ -26,12 +26,9 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/pybind/eager_generator.h"
#include "paddle/fluid/pybind/pybind.h"
#include "paddle/fluid/string/string_helper.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
#endif
#include "paddle/fluid/pybind/eager_generator.h"
// phi
#include "paddle/phi/kernels/declarations.h"
......@@ -485,11 +482,6 @@ int main(int argc, char* argv[]) {
return -1;
}
#ifdef PADDLE_WITH_ASCEND_CL
auto ascend_ptr = paddle::framework::AscendInstance::GetInstance();
ascend_ptr->InitGEForUT();
#endif
std::vector<std::string> headers{
"<Python.h>",
"\"paddle/fluid/platform/enforce.h\"",
......@@ -557,9 +549,5 @@ int main(int argc, char* argv[]) {
out.close();
#ifdef PADDLE_WITH_ASCEND_CL
ge::GEFinalize();
#endif
return 0;
}
......@@ -2547,9 +2547,9 @@ void BindImperative(py::module *m_ptr) {
},
py::call_guard<py::gil_scoped_release>());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \
defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_CNCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_CNCL)
py::class_<imperative::ParallelContext,
std::shared_ptr<imperative::ParallelContext>>(m,
"ParallelContext");
......@@ -2630,7 +2630,7 @@ void BindImperative(py::module *m_ptr) {
#endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_XPU_BKCL)
py::class_<imperative::HeterParallelContext,
imperative::ParallelContext,
std::shared_ptr<imperative::HeterParallelContext>>(
......
......@@ -57,7 +57,7 @@ inline void CopyWithContext(const Context& ctx,
const void* src,
size_t num) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
defined(PADDLE_WITH_MLU)
memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream());
#else
PADDLE_THROW(
......
......@@ -101,9 +101,6 @@ int main(int argc, char** argv) {
int ret = RUN_ALL_TESTS();
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::AclInstance::Instance().Finalize();
#endif
if (env_str) free(env_str);
if (undefok_str) free(undefok_str);
return ret;
......