Unverified commit 80dd1672, authored by 张春乔, committed by GitHub

mv PADDLE_WITH_ASCEND_CL (#52535)

Parent 29c28e2f
@@ -52,7 +52,7 @@ void MessageBus::Init(
}
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
+    defined(PADDLE_WITH_XPU_BKCL)
// NOTE: To make the brpc is compatible with collective,
// need release the handler holding the ip address.
if (addr_ != "") {
......
@@ -2128,12 +2128,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
// CPUKernel will be executed and a warning will be given at the same
// time.
expected_kernel_key.place_ = platform::CPUPlace();
#ifdef PADDLE_WITH_ASCEND_CL
if (SupportNPU()) {
auto& dev_ctx = ctx.device_context();
expected_kernel_key.place_ = dev_ctx.GetPlace();
}
#endif
if (platform::is_cpu_place(expected_kernel_key.place_)) {
LOG_FIRST_N(WARNING, 1)
<< "Op(" << type_
@@ -2305,16 +2300,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
platform::is_mlu_place(expected_kernel_key.place_)) {
......
@@ -150,15 +150,6 @@ AmpOperators::AmpOperators()
unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
unsupported_ops_gpu_bf16.end());
// NOTE: GPU/NPU/XPU/MLU is compiled seperatly.
#elif defined(PADDLE_WITH_ASCEND_CL)
auto unsupported_ops_npu_fp16 = std::get<2>(
OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_npu_fp16.begin(),
unsupported_ops_npu_fp16.end());
auto unsupported_ops_npu_bf16 = std::get<2>(
OpSupportedInfos("NPU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_npu_bf16.begin(),
unsupported_ops_npu_bf16.end());
#elif defined(PADDLE_WITH_XPU)
auto unsupported_ops_xpu_fp16 = std::get<2>(
OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16));
......
@@ -34,8 +34,6 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "xpu/refactor/math.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
@@ -270,32 +268,6 @@ void TensorAdd(const VarType& src, VarType* dst) {
#endif
}
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place)) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<platform::NPUDeviceContext*>(ctx);
if (data_type == framework::DataTypeTrait<float>::DataType()) {
dst_tensor->mutable_data<float>(place);
} else if (data_type == framework::DataTypeTrait<double>::DataType()) {
dst_tensor->mutable_data<double>(place);
} else if (data_type ==
framework::DataTypeTrait<platform::float16>::DataType()) {
dst_tensor->mutable_data<platform::float16>(place);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type),
place));
}
const auto& runner = operators::NpuOpRunner(
"Add", {*dst_tensor, src_tensor}, {*dst_tensor}, {});
runner.Run(dev_ctx->stream());
return;
}
#endif
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(place)) {
if (data_type == framework::DataTypeTrait<float>::DataType()) {
......
@@ -12,62 +12,3 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/imperative/parallel_context.h"
namespace paddle {
namespace framework {
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace imperative {
class HCCLParallelContext : public ParallelContext {
public:
explicit HCCLParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: ParallelContext(strategy, place) {}
~HCCLParallelContext() override = default;
void BcastHCCLId(const std::vector<HcclRootInfo>& hccl_ids,
int root, // NOLINT
int server_fd);
void Init() override;
void InitWithRingID(int ring_id) override;
void AllReduceByStream(const framework::Variable& src,
framework::Variable* dst,
int ring_id,
bool use_calc_stream) override;
void Broadcast(framework::Variable* src, int ring_id) override;
paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;
void WaitCompute(int ring_id) override;
void WaitComm(int ring_id) override;
void SynchronizeCompute() override;
private:
// used for comm wait compute, compute_stream-->event-->comm_stream[ring_id]
std::vector<std::shared_ptr<platform::NpuStreamObject>> compute_events_;
// used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream
std::vector<std::shared_ptr<platform::NpuEventObject>> comm_events_;
};
} // namespace imperative
} // namespace paddle
#endif
@@ -42,8 +42,6 @@ HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy,
: ParallelContext(strategy, platform::CUDAPlace(device_id))
#elif PADDLE_WITH_XPU_BKCL
: ParallelContext(strategy, platform::XPUPlace(device_id))
#elif PADDLE_WITH_ASCEND_CL
: ParallelContext(strategy, platform::NPUPlace(device_id))
#else
: ParallelContext(strategy, platform::CPUPlace())
#endif
@@ -112,11 +110,6 @@ HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy,
node_parallel_ctx_ =
std::make_shared<BKCLParallelContext>(node_strategy_, node_place_);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
node_place_ = platform::NPUPlace(device_id);
node_parallel_ctx_ =
std::make_shared<HCCLParallelContext>(node_strategy_, node_place_);
#endif
}
void HeterParallelContext::Init() {
......
@@ -24,11 +24,6 @@
#ifdef PADDLE_WITH_XPU_BKCL
#include "paddle/fluid/imperative/bkcl_context.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/imperative/hccl_context.h"
#endif
#include "paddle/fluid/imperative/gloo_context.h" #include "paddle/fluid/imperative/gloo_context.h"
#include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/imperative/parallel_context.h"
......
@@ -458,17 +458,6 @@ PreparedOp PrepareImpl(
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
paddle::platform::is_npu_place(fluid_kernel_type.place_)) {
VLOG(3) << "missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << fluid_kernel_type
<< ", fallbacking to CPU one!";
fluid_kernel_type.place_ = platform::CPUPlace();
kernel_iter = kernels.find(fluid_kernel_type);
}
#endif
#ifdef PADDLE_WITH_IPU
if (kernel_iter == kernels.end() &&
paddle::platform::is_ipu_place(fluid_kernel_type.place_)) {
......
@@ -31,7 +31,7 @@ namespace imperative {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
+    defined(PADDLE_WITH_CNCL)
// div the nranks
void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
phi::DenseTensor *tensor =
@@ -305,17 +305,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
"Please recompile or reinstall Paddle with BKCL support."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
ConcatTensorsWithType(
static_cast<const platform::NPUDeviceContext &>(context),
dense_tensors_,
&dense_contents_,
dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't concat npu grads since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_CNCL
ConcatTensorsWithType(
@@ -365,17 +358,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
"Please recompile or reinstall Paddle with BKCL support."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
SplitTensorsWithType(
static_cast<const platform::NPUDeviceContext &>(context),
&dense_contents_,
&dense_tensors_,
dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't split npu grad since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_CNCL
SplitTensorsWithType(
@@ -1129,9 +1115,8 @@ void Reducer::FinalizeBackward() {
if (find_unused_vars_each_step_) {
// TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \
-    defined(PADDLE_WITH_CNCL)
+    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_CNCL)
ProcessUnusedDenseVars();
#endif
// Initialize local used vars
......
@@ -46,7 +46,7 @@ namespace imperative {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
+    defined(PADDLE_WITH_CNCL)
template <typename T>
struct DivNRanksFunctor {
......
@@ -135,15 +135,10 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
gc.reset(new framework::CPUGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
} else if (platform::is_npu_place(place)) {
#if defined(PADDLE_WITH_ASCEND_CL)
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
gc.reset(new framework::NPUUnsafeFastGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use NPU device since it's not compiled with NPU,"
"Please recompile or reinstall Paddle with NPU support."));
#endif
} else if (platform::is_ipu_place(place)) {
#if defined(PADDLE_WITH_IPU)
gc.reset(new framework::IPUGarbageCollector(place, 0));
@@ -303,12 +298,8 @@ void Tracer::TraceOpImpl(const std::string& type,
"PaddlePaddle should compile with XPU if use XPUPlace."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
platform::SetNPUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
platform::SetMLUDeviceId(place.device);
......
@@ -49,12 +49,6 @@ class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Scale",
"(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
"operator.");
#ifdef PADDLE_WITH_ASCEND_CL
AddInput("FloatStatus",
"(Tensor) 1-dim tensor of shape [8], allocated by "
"alloc_float_status op")
.AsDispensable();
#endif
AddOutput("Out", AddOutput("Out",
"(Tensors) The scaled output tensor of " "(Tensors) The scaled output tensor of "
"check_finite_and_unscale operator.") "check_finite_and_unscale operator.")
......
@@ -57,26 +57,7 @@ struct FillConstantVisitor {
void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
std::is_same<T, int16_t>::value)>::type
* = nullptr) const {
-#ifdef PADDLE_WITH_ASCEND_CL
+#if defined(PADDLE_WITH_MLU)
if (platform::is_npu_place(dev_ctx_.GetPlace())) {
phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(dtype_));
tensor_tmp.mutable_data<T>({1}, context_.GetPlace());
FillNpuTensorWithConstant<T>(&tensor_tmp, static_cast<T>(value_));
const auto &runner =
NpuOpRunner("FillD",
{tensor_tmp},
{*tensor_},
{{"dims", phi::vectorize(tensor_->dims())}});
auto stream =
context_.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
} else {
phi::funcs::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
}
#elif defined(PADDLE_WITH_MLU)
if (platform::is_mlu_place(context_.GetPlace())) {
FillMLUTensorWithHostValue<T>(context_, static_cast<T>(value_), tensor_);
} else {
@@ -235,12 +216,6 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
// Init the continuous space
size_t offset = 0;
if (context.Attr<bool>("copy_data")) {
#ifdef PADDLE_WITH_ASCEND_CL
framework::VisitDataType(
dtype,
FillConstantVisitor<DeviceContext>(
dev_ctx, fused_tensor, static_cast<float>(0.0), dtype, context));
#endif
for (size_t i = 0; i < in_var_names.size(); ++i) {
size_t len = static_cast<size_t>(in_tensors[i]->numel());
auto sub_tensor = fused_tensor->Slice(
@@ -534,25 +509,6 @@ REGISTER_OPERATOR(coalesce_tensor,
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#if defined(PADDLE_WITH_ASCEND_CL)
REGISTER_OP_CUDA_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, double>);
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
REGISTER_OP_NPU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
ops::CoalesceTensorOpKernel<phi::CPUContext, float>,
ops::CoalesceTensorOpKernel<phi::CPUContext, plat::float16>,
ops::CoalesceTensorOpKernel<phi::CPUContext, double>);
#endif
#if defined(PADDLE_WITH_MLU)
REGISTER_OP_MLU_KERNEL(
coalesce_tensor,
......
@@ -44,10 +44,6 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allgather result");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all gather.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
@@ -16,10 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
@@ -27,51 +23,8 @@ template <typename T>
class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
framework::DDim out_dims = in->dims();
out_dims[0] *= nranks;
out->mutable_data<T>(out_dims, place);
uint64_t send_numel = in->numel();
void *send_buff = reinterpret_cast<void *>(const_cast<T *>(in->data<T>()));
void *recv_buff = reinterpret_cast<void *>(out->data<T>());
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext *>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
VLOG(3) << "begin hccl allgather, parameter is: "
<< ", group is " << group << ", ring_id is " << ring_id
<< ", nranks is " << nranks;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllGather(send_buff,
recv_buff,
send_numel,
dtype,
comm->comm(),
reinterpret_cast<void *>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
@@ -34,10 +34,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
@@ -34,10 +34,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
@@ -24,9 +24,8 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/api/include/tensor.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) || \
-    defined(PADDLE_WITH_CNCL)
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
@@ -44,17 +43,10 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#endif
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
DECLARE_bool(hccl_check_nan);
#endif
namespace paddle {
namespace operators {
@@ -150,177 +142,12 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
template <typename T, typename DeviceContext> \
class op_name##CPUKernel : public CAllReduceOpCPUKernel<red_type, T> {};
#if defined(PADDLE_WITH_ASCEND_CL)
// return true if found_nan or return false;
inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
aclrtStream stream,
const phi::DenseTensor* in) {
phi::DenseTensor out(in->type());
phi::DenseTensor mean(in->type());
mean.Resize({1});
mean.mutable_data<float>(dev_ctx.GetPlace());
std::vector<int> axes;
for (int i = 0; i < in->dims().size(); ++i) {
axes.push_back(i);
}
std::vector<float> vec;
try {
const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
paddle::framework::TensorToVector(mean, dev_ctx, &vec);
} catch (...) {
LOG(WARNING) << "ContainsNan catch exception";
return true;
}
VLOG(4) << "reducemeand result:" << vec[0];
if (std::isnan(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects nan";
return true;
}
if (std::isinf(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects inf";
}
return false;
}
#endif
template <ReduceType red_type, typename T>
class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
if (ctx.HasInput("Cond")) {
auto cond = ctx.Input<phi::DenseTensor>("Cond");
auto place = cond->place();
PADDLE_ENFORCE_EQ(platform::is_cpu_place(place),
true,
platform::errors::PreconditionNotMet(
"The input `cond` tensor should be on cpu place"));
PADDLE_ENFORCE_EQ(cond->numel(),
1,
platform::errors::PreconditionNotMet(
"The input `cond` should be shape [1]"));
if (!cond->data<bool>()[0]) {
VLOG(4) << "Skip all reduce Op since cond is 0";
return;
}
}
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
out->mutable_data<T>(in->dims(), ctx.GetPlace());
void* recvbuff = reinterpret_cast<void*>(out->data<T>());
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
if (ctx.Attr<bool>("use_calc_stream")) {
stream = dev_ctx->stream();
} else {
stream = comm->stream();
}
HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
switch (red_type) {
case kRedSum:
hccl_red_type = HCCL_REDUCE_SUM;
break;
case kRedMax:
hccl_red_type = HCCL_REDUCE_MAX;
break;
case kRedMin:
hccl_red_type = HCCL_REDUCE_MIN;
break;
case kRedProd:
hccl_red_type = HCCL_REDUCE_PROD;
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid reduce type: %d", red_type));
}
VLOG(3) << "hccl allreduce, parameter is: "
<< "input num: " << in->dims() << "dtype: " << dtype
<< "hccl_red_type: " << hccl_red_type << ", group is: " << group
<< ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
<< ", out_size:" << out->memory_size()
<< ", use_calc_stream:" << ctx.Attr<bool>("use_calc_stream")
<< ", stream:" << stream;
phi::DenseTensor tmp;
tmp.mutable_data<float>({8}, ctx.GetPlace());
bool found_nan = false;
auto d_type = framework::TransToProtoVarType(in->dtype());
switch (d_type) {
case framework::proto::VarType::FP16: {
break;
}
case framework::proto::VarType::FP32: {
if (FLAGS_hccl_check_nan) {
VLOG(3) << "prepare to FoundNanInf";
// NOTE: performance relating, DO NOT REMOVE!
ContainsNan(*dev_ctx, dev_ctx->stream(), in);
}
break;
}
default:
break;
}
if (found_nan) {
T inf = static_cast<T>(std::numeric_limits<float>::infinity());
VLOG(4) << "fill input data constant inf";
auto dims = in->dims();
auto mutable_in = const_cast<phi::DenseTensor*>(in);
FillNpuTensorWithConstant<T>(mutable_in, inf);
mutable_in->Resize(dims);
}
VLOG(3) << "hccl allreduce, parameter is: "
<< "input num: " << numel << "dtype: " << dtype
<< "hccl_red_type: " << hccl_red_type << ", group is: " << group
<< ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
<< ", out_size:" << out->memory_size();
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllReduce(sendbuff,
recvbuff,
numel,
dtype,
hccl_red_type,
comm->comm(),
reinterpret_cast<void*>(stream)));
out->Resize(in->dims());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
@@ -616,10 +443,6 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allreduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
......
@@ -42,10 +42,7 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(0);
AddAttr<int>("root", "(int default 0) root id for broadcasting.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
@@ -14,10 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
@@ -25,58 +21,8 @@ template <typename T>
class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<phi::DenseTensor>("X");
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
int numel = x->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));
auto out = ctx.Output<phi::DenseTensor>("Out");
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int root = ctx.Attr<int>("root");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
VLOG(3) << "begin hccl broadcast, parameter is: "
<< "root " << root << ", group is " << group
<< ", comm: " << comm->comm() << ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received "
<< phi::product(out->dims());
dev_ctx->Wait();
if (out != x) {
framework::TensorCopy(*static_cast<const phi::DenseTensor*>(x),
place,
*platform::DeviceContextPool::Instance().Get(place),
static_cast<phi::DenseTensor*>(out));
}
dev_ctx->Wait();
out->Resize(x->dims());
out->set_lod(x->lod());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
@@ -21,11 +21,6 @@ namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
#if defined(PADDLE_WITH_ASCEND_CL)
#include "hccl/hccl.h"
#include "hccl/hccl_types.h"
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
@@ -48,52 +43,9 @@ class CCommInitOpAscend : public framework::OperatorBase {
auto var = scope.FindVar(Input("X"));
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::InvalidArgument("Input con not be empty."));
#if defined(PADDLE_WITH_ASCEND_CL)
HcclRootInfo* hccl_id = var->GetMutable<HcclRootInfo>();
int rank_ids = Attr<int>("rank_ids");
int rank_id = Attr<int>("rank");
int rid = Attr<int>("ring_id");
int device_id = place.device;
if (Attr<int>("device_id") >= 0) {
device_id = Attr<int>("device_id");
}
platform::HCCLCommContext::Instance().CreateHCCLComm(
hccl_id, rank_ids, rank_id, device_id, rid);
// Build comm
float* buff;
int32_t size = 20;
std::vector<float> input(size, 0);
for (int32_t idx = 0; idx < size; idx++) {
input[idx] = 1.0;
}
PADDLE_ENFORCE_NPU_SUCCESS(platform::RecordedNPUMalloc(
reinterpret_cast<void**>(&buff), size * sizeof(float), device_id));
platform::NPUMemcpySync(reinterpret_cast<void*>(buff),
input.data(),
size * sizeof(float),
ACL_MEMCPY_HOST_TO_DEVICE,
size * sizeof(float));
VLOG(3) << "Build buff data successful.";
aclrtStream stream = nullptr;
auto comm = paddle::platform::HCCLCommContext::Instance().Get(rid, place);
if (rank_id == 0) {
stream = comm->stream();
} else {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
}
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
buff, size, HCCL_DATA_TYPE_FP32, 0, comm->comm(), stream));
// Synchronize stream to find hccl error in time.
platform::NPUStreamSync(stream);
VLOG(3) << "Build connection successful.";
#else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU.")); "PaddlePaddle should compile with NPU."));
#endif
}
};
......
@@ -27,83 +27,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
#ifdef PADDLE_WITH_ASCEND_CL
static void GenHCCLID(std::vector<HcclRootInfo>* hccl_ids) {
constexpr int timeout = 2 * 60 + 10; // 2MSL+10s
constexpr int retry_time = 1;
for (size_t i = 0; i < hccl_ids->size(); ++i) {
bool failed = true;
for (auto retry_times = 0; retry_times * retry_time < timeout;
++retry_times) {
auto err = platform::dynload::HcclGetRootInfo(&(*hccl_ids)[i]);
if (err == 0) {
failed = false;
break;
}
std::this_thread::sleep_for(std::chrono::seconds(retry_time));
LOG(WARNING) << "HcclGetRootInfo failed, err is: " << err << ", retry "
<< retry_times << " times";
}
if (failed) {
PADDLE_THROW(platform::errors::External("HcclGetRootInfo failed!"));
}
}
}
static void CopyHCCLIDToVar(const std::vector<HcclRootInfo>& hccl_ids,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
for (size_t i = 0; i < hccl_ids.size(); ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto hccl_id = var->GetMutable<HcclRootInfo>();
memcpy(hccl_id, &hccl_ids[i], sizeof(HcclRootInfo));
}
}
class CGenHCCLIdOp : public framework::OperatorBase {
public:
CGenHCCLIdOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
int rank = Attr<int>("rank");
int ring_id = Attr<int>("ring_id");
std::function<std::string(size_t)> func = [&](size_t i) -> std::string {
return Output("Out");
};
std::string endpoint = Attr<std::string>("endpoint");
int server_fd = platform::SocketServer::GetInstance(endpoint).socket();
std::vector<HcclRootInfo> hccl_ids;
hccl_ids.resize(1);
if (rank == 0) {
GenHCCLID(&hccl_ids);
std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("other_endpoints");
platform::SendBroadCastCommID(endpoint_list, &hccl_ids, ring_id);
} else {
platform::RecvBroadCastCommID(server_fd, endpoint, &hccl_ids, ring_id);
}
CopyHCCLIDToVar(hccl_ids, func, scope);
}
};
#else
class CGenHCCLIdOp : public framework::OperatorBase {
public:
CGenHCCLIdOp(const std::string& type,
@@ -116,8 +39,6 @@ class CGenHCCLIdOp : public framework::OperatorBase {
const platform::Place& dev_place) const override {}
};
#endif
class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......
@@ -26,7 +26,7 @@ limitations under the License. */
#include "paddle/phi/core/ddim.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
+    defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
@@ -44,9 +44,6 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#endif
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
@@ -134,86 +131,8 @@ template <ReduceType red_type, typename T>
class CReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void* recvbuff = reinterpret_cast<void*>(out->data<T>());
int ring_id = ctx.Attr<int>("ring_id");
int root_id = ctx.Attr<int>("root_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int rank_id = comm->rank();
HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
switch (red_type) {
case kRedSum:
hccl_red_type = HCCL_REDUCE_SUM;
break;
case kRedMax:
hccl_red_type = HCCL_REDUCE_MAX;
break;
case kRedMin:
hccl_red_type = HCCL_REDUCE_MIN;
break;
case kRedProd:
hccl_red_type = HCCL_REDUCE_PROD;
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid reduce type: %d", red_type));
}
VLOG(3) << "begin hccl reduce, parameter is: "
<< "input num: " << numel << "root_id: " << root_id
<< "dtype: " << dtype << "hccl_red_type: " << hccl_red_type
<< ", group is: " << group;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllReduce(sendbuff,
recvbuff,
numel,
dtype,
hccl_red_type,
comm->comm(),
reinterpret_cast<void*>(stream)));
if (rank_id != root_id) {
auto npu_place = place;
memory::Copy(npu_place,
reinterpret_cast<void*>(out->data<T>()),
npu_place,
reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
numel * sizeof(T),
stream);
}
out->Resize(in->dims());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
@@ -433,10 +352,7 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the reduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for reduce.")
.SetDefault("tag");
#endif
AddAttr<int>("root_id", "(int default 0) root id.").SetDefault(0); AddAttr<int>("root_id", "(int default 0) root id.").SetDefault(0);
AddAttr<bool>( AddAttr<bool>(
"use_calc_stream", "use_calc_stream",
......
@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
@@ -50,10 +50,7 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("nranks",
"Total trainer count of the distributed training job")
.SetDefault(1);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for reduce scatter.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
@@ -14,10 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
@@ -25,59 +21,8 @@ template <typename T>
class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
auto out_dims = in->dims();
PADDLE_ENFORCE_EQ(out_dims[0] % nranks,
0,
platform::errors::InvalidArgument(
"The input tensor X's "
"dim[0] (%d) should be divisible by nranks(%d)",
out_dims[0],
nranks));
out_dims[0] = out_dims[0] / nranks;
out->mutable_data<T>(out_dims, place);
uint64_t recv_numel = in->numel() / nranks;
void* inputPtr = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void* outputPtr = reinterpret_cast<void*>(out->data<T>());
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
VLOG(3) << "begin hccl reduce scatter, parameter is: "
<< "recv_numel: " << recv_numel << "dtype: " << dtype
<< "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclReduceScatter(inputPtr,
outputPtr,
recv_numel,
dtype,
HCCL_REDUCE_SUM,
comm->comm(),
reinterpret_cast<void*>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
@@ -34,10 +34,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
@@ -47,17 +47,6 @@ class CSyncCalcStreamKernel : public framework::OpKernel<T> {
platform::GpuStreamSync(dev_ctx->stream());
#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_npu_place(place),
true,
platform::errors::PreconditionNotMet(
"Sync stream op can run on npu place only for now."));
auto dev_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
platform::NPUStreamSync(dev_ctx->stream());
#elif defined(PADDLE_WITH_CNCL)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_mlu_place(place),
......
@@ -26,7 +26,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
-#if defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
@@ -45,19 +45,6 @@ class CSyncCommStreamKernel : public framework::OpKernel<T> {
platform::GpuStreamSync(stream);
#elif defined(PADDLE_WITH_ASCEND_CL)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_npu_place(place),
true,
platform::errors::PreconditionNotMet(
"Sync comm stream op can run on npu place only for "
"now, but we got %s, please check the environment.",
place.DebugString()));
int ring_id = ctx.Attr<int>("ring_id");
auto stream =
platform::HCCLCommContext::Instance().Get(ring_id, place)->stream();
platform::NPUStreamSync(stream);
#elif defined(PADDLE_WITH_CNCL)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_mlu_place(place),
......
@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
@@ -32,10 +32,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
@@ -30,144 +30,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
#ifdef PADDLE_WITH_ASCEND_CL
class GenHCCLIdOp : public framework::OperatorBase {
public:
GenHCCLIdOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
std::vector<std::string> trainers =
Attr<std::vector<std::string>>("trainers");
int trainer_id = Attr<int>("trainer_id");
std::string endpoint = trainers[trainer_id];
PADDLE_ENFORCE_GE(
trainer_id,
0,
platform::errors::InvalidArgument("trainer_id %d is less than 0. Its "
"valid range is [0, trainer_size)"));
PADDLE_ENFORCE_LT(
trainer_id,
static_cast<int>(trainers.size()),
platform::errors::OutOfRange("trainer_id %d is out of range. Its valid "
"range is [0, trainer_size)",
trainer_id));
int hccl_comm_num = Attr<int>("hccl_comm_num");
int use_hierarchical_allreduce = Attr<bool>("use_hierarchical_allreduce");
int inter_nranks = Attr<int>("hierarchical_allreduce_inter_nranks");
int inter_trainer_id = -1;
int exter_trainer_id = -1;
if (use_hierarchical_allreduce) {
PADDLE_ENFORCE_GT(
trainers.size(),
1,
platform::errors::PreconditionNotMet(
"The number of collective trainers %llu <= 1", trainers.size()));
PADDLE_ENFORCE_GT(
inter_nranks,
1,
platform::errors::PreconditionNotMet(
"inter_nranks %d <= 1 while in hierarchical allreduce mode",
inter_nranks));
PADDLE_ENFORCE_EQ(
trainers.size() % inter_nranks,
0,
platform::errors::PreconditionNotMet(
"The number of trainers %llu mod inter_nranks %d is not equal 0",
trainers.size(),
inter_nranks));
inter_trainer_id = trainer_id % inter_nranks;
if (trainer_id % inter_nranks == 0) {
exter_trainer_id = trainer_id / inter_nranks;
}
}
std::ostringstream ss;
for (size_t i = 0; i < trainers.size(); i++) {
ss << trainers[i] << ",";
}
VLOG(1) << "trainer_id:" << trainer_id
<< ", use_hierarchical_allreduce:" << use_hierarchical_allreduce
<< ", hccl_comm_num:" << hccl_comm_num
<< ", inter_nranks:" << inter_nranks
<< ", inter_trainer_id:" << inter_trainer_id
<< ", exter_trainer_id:" << exter_trainer_id
<< ", trainers:" << ss.str();
int server_fd = -1;
/// 1. init flat
std::function<std::string(size_t)> func = platform::GetFlatHCCLVarName;
if (trainer_id == 0) {
// server endpoints
std::vector<std::string> flat_endpoints;
flat_endpoints.insert(
flat_endpoints.begin(), trainers.begin() + 1, trainers.end());
SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope);
} else {
server_fd = CreateListenSocket(endpoint);
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
/// 2. hierarchical inter ncclid
func = platform::GetHierarchicalInterHCCLVarName;
if (inter_trainer_id == 0) {
std::ostringstream ss;
ss << endpoint;
std::vector<std::string> inter_endpoints;
for (int i = trainer_id + 1; i < trainer_id + inter_nranks &&
i < static_cast<int>(trainers.size());
i++) {
ss << ",";
inter_endpoints.push_back(trainers[i]);
ss << trainers[i];
}
VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str();
SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope);
} else if (inter_trainer_id > 0) {
VLOG(1) << "Hierarchical inter ring";
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
/// 3. hierarchical exter ncclid
func = platform::GetHierarchicalExterHCCLVarName;
if (exter_trainer_id == 0) {
std::ostringstream ss;
std::vector<std::string> exter_endpoints;
ss << endpoint;
for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) {
ss << ",";
exter_endpoints.push_back(trainers[i]);
ss << trainers[i];
}
VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str();
SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope);
} else if (exter_trainer_id > 0) {
VLOG(1) << "Hierarchical exter ring";
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
// close socket server
if (trainer_id != 0) {
CloseSocket(server_fd);
}
}
};
#else
class GenHCCLIdOp : public framework::OperatorBase { class GenHCCLIdOp : public framework::OperatorBase {
public: public:
GenHCCLIdOp(const std::string& type, GenHCCLIdOp(const std::string& type,
...@@ -180,8 +42,6 @@ class GenHCCLIdOp : public framework::OperatorBase { ...@@ -180,8 +42,6 @@ class GenHCCLIdOp : public framework::OperatorBase {
const platform::Place& dev_place) const override {} const platform::Place& dev_place) const override {}
}; };
#endif
class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
......
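The GenHCCLIdOp body removed above builds up to three rings when hierarchical allreduce is enabled: a flat ring over all trainers, a bottom ("inter") ring of `inter_nranks` neighbours, and a top ("exter") ring made of every `inter_nranks`-th trainer. In each ring the root broadcasts the HCCL unique id over TCP while the other members listen. The standalone sketch below reproduces only the ring-membership arithmetic from that code; `RingPlan` and `PlanHierarchicalRings` are illustrative names, not Paddle API.

```cpp
// Illustrative sketch only: mirrors the ring-membership arithmetic of the
// removed GenHCCLIdOp::RunImpl (assuming use_hierarchical_allreduce is true).
#include <cstdio>
#include <string>
#include <vector>

struct RingPlan {
  int inter_trainer_id = -1;  // rank inside this trainer's bottom ring
  int exter_trainer_id = -1;  // rank inside the top ring, -1 if not a member
  std::vector<std::string> inter_endpoints;  // peers in the bottom ring
  std::vector<std::string> exter_endpoints;  // every inter_nranks-th trainer
};

RingPlan PlanHierarchicalRings(const std::vector<std::string>& trainers,
                               int trainer_id, int inter_nranks) {
  RingPlan plan;
  plan.inter_trainer_id = trainer_id % inter_nranks;
  if (plan.inter_trainer_id == 0) {
    plan.exter_trainer_id = trainer_id / inter_nranks;
  }
  // Bottom ring: the next inter_nranks - 1 trainers after this one
  // (only the ring root actually uses this list to send the id).
  for (int i = trainer_id + 1;
       i < trainer_id + inter_nranks && i < static_cast<int>(trainers.size());
       ++i) {
    plan.inter_endpoints.push_back(trainers[i]);
  }
  // Top ring: one representative (rank 0) from each bottom ring.
  for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) {
    plan.exter_endpoints.push_back(trainers[i]);
  }
  return plan;
}

int main() {
  std::vector<std::string> trainers = {"10.0.0.1:6170", "10.0.0.2:6170",
                                       "10.0.0.3:6170", "10.0.0.4:6170"};
  RingPlan plan =
      PlanHierarchicalRings(trainers, /*trainer_id=*/0, /*inter_nranks=*/2);
  std::printf("inter_id=%d exter_id=%d\n", plan.inter_trainer_id,
              plan.exter_trainer_id);
}
```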
...@@ -30,10 +30,6 @@ limitations under the License. */ ...@@ -30,10 +30,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
DECLARE_int32(get_host_by_name_time); DECLARE_int32(get_host_by_name_time);
namespace paddle { namespace paddle {
......
...@@ -42,10 +42,7 @@ class MpAllReduceSumOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -42,10 +42,7 @@ class MpAllReduceSumOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allreduced result in model parallel."); AddOutput("Out", "(Tensor) the allreduced result in model parallel.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.") AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0); .SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
.SetDefault("tag");
#endif
AddAttr<bool>( AddAttr<bool>(
"use_calc_stream", "use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.") "(bool default false) eject CUDA operations to calculation stream.")
......
...@@ -50,10 +50,7 @@ class PartialAllGatherOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -50,10 +50,7 @@ class PartialAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allgather result"); AddOutput("Out", "(Tensor) the allgather result");
AddAttr<int>("ring_id", "(int default 0) communication ring id.") AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0); .SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all gather.")
.SetDefault("tag");
#endif
AddAttr<bool>( AddAttr<bool>(
"use_calc_stream", "use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.") "(bool default false) eject CUDA operations to calculation stream.")
......
...@@ -24,67 +24,8 @@ template <typename T> ...@@ -24,67 +24,8 @@ template <typename T>
class CallPartialGatherOpASCENDKernel : public framework::OpKernel<T> { class CallPartialGatherOpASCENDKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int64_t numel = in->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int rank = ctx.Attr<int>("rank");
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
PADDLE_ENFORCE_EQ(rank,
comm->rank(),
platform::errors::InvalidArgument(
"rank: %s should equal to %s", rank, comm->rank()));
PADDLE_ENFORCE_EQ(
(numel % nranks),
0,
platform::errors::InvalidArgument(
"The input numel (%d) must be divisible by nranks(%d)",
numel,
nranks));
framework::DDim dims = in->dims();
out->mutable_data<T>(dims, place);
int64_t send_numel = numel / nranks;
int offset = send_numel * rank;
void *send_buff =
reinterpret_cast<void *>(const_cast<T *>(in->data<T>()) + offset);
void *recv_buff = reinterpret_cast<void *>(out->data<T>());
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext *>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
VLOG(3) << "begin hccl allgather, parameter is: "
<< ", group is " << group << ", ring_id is " << ring_id
<< ", nranks is " << nranks << ", rankid is " << rank;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllGather(send_buff,
recv_buff,
send_numel,
dtype,
comm->comm(),
reinterpret_cast<void *>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU.")); "PaddlePaddle should compile with NPU."));
#endif
} }
}; };
......
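The deleted partial all-gather NPU kernel slices each rank's input into `numel / nranks` elements at offset `send_numel * rank` and hands that slice to `HcclAllGather`. The CPU stand-in below shows only the slicing arithmetic and the divisibility check, assuming each rank contributes its own slice to the concatenated output; it is not the device implementation.

```cpp
// Minimal CPU stand-in for the slicing logic of the removed partial
// all-gather NPU kernel.
#include <cassert>
#include <cstdio>
#include <vector>

std::vector<float> PartialAllGather(
    const std::vector<std::vector<float>>& per_rank_input, int nranks) {
  const int64_t numel = static_cast<int64_t>(per_rank_input[0].size());
  assert(numel % nranks == 0);  // mirrors the PADDLE_ENFORCE_EQ in the kernel
  const int64_t send_numel = numel / nranks;
  std::vector<float> out(numel);
  for (int rank = 0; rank < nranks; ++rank) {
    const int64_t offset = send_numel * rank;  // slice owned by this rank
    for (int64_t j = 0; j < send_numel; ++j) {
      out[offset + j] = per_rank_input[rank][offset + j];
    }
  }
  return out;
}

int main() {
  std::vector<std::vector<float>> inputs = {{1, 2, 3, 4}, {5, 6, 7, 8}};
  auto out = PartialAllGather(inputs, /*nranks=*/2);
  for (float v : out) std::printf("%.0f ", v);  // prints: 1 2 7 8
}
```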
...@@ -98,12 +98,7 @@ class PartialRecvOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -98,12 +98,7 @@ class PartialRecvOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0); AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.") AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
.SetDefault(5); .SetDefault(5);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.") AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
.SetDefault(std::vector<int>()); .SetDefault(std::vector<int>());
AddAttr<bool>( AddAttr<bool>(
......
...@@ -22,57 +22,8 @@ template <typename T> ...@@ -22,57 +22,8 @@ template <typename T>
class PartialRecvOpASCENDKernel : public framework::OpKernel<T> { class PartialRecvOpASCENDKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(out->dims(), ctx.GetPlace());
int num = ctx.Attr<int>("num");
int id = ctx.Attr<int>("id");
int recv_numel = out->numel() / num;
int offset = recv_numel * id;
void* ptr =
reinterpret_cast<void*>(const_cast<T*>(out->data<T>()) + offset);
int numel = recv_numel;
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(out->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int peer = ctx.Attr<int>("peer");
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = peer;
VLOG(3) << "begin hccl recv, parameter is: "
<< "ring_id:" << ring_id << ", nranks:" << nranks
<< ", peer:" << peer << ", numel:" << numel << ", ptr:" << ptr
<< ", dtype:" << dtype << ", root:" << root
<< ", comm: " << comm->comm() << ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU.")); "PaddlePaddle should compile with NPU."));
#endif
} }
}; };
......
...@@ -65,12 +65,7 @@ class PartialSendMaker : public framework::OpProtoAndCheckerMaker { ...@@ -65,12 +65,7 @@ class PartialSendMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.") AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0); .SetDefault(0);
AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0); AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<bool>( AddAttr<bool>(
"use_calc_stream", "use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.") "(bool default false) eject CUDA operations to calculation stream.")
......
...@@ -22,52 +22,8 @@ template <typename T> ...@@ -22,52 +22,8 @@ template <typename T>
class PartialSendOpASCENDKernel : public framework::OpKernel<T> { class PartialSendOpASCENDKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<phi::DenseTensor>("X");
int num = ctx.Attr<int>("num");
int id = ctx.Attr<int>("id");
int send_numel = x->numel() / num;
int offset = send_numel * id;
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()) + offset);
int numel = send_numel;
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int rank = comm->rank();
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = rank;
VLOG(3) << "begin hccl send, parameter is: "
<< "root " << root << ", comm: " << comm->comm()
<< ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU.")); "PaddlePaddle should compile with NPU."));
#endif
} }
}; };
......
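Both the partial_recv and partial_send NPU kernels removed above enforce `nranks == 2` and emulate a point-to-point transfer with `HcclBroadcast`: the sender uses its own rank as the broadcast root and the receiver uses the peer's rank, so the two sides agree on the same root. A minimal sketch of that root selection, with illustrative helper names:

```cpp
// Sketch of the root selection used by the removed partial_send/partial_recv
// NPU kernels; names are illustrative, not Paddle API.
#include <cassert>
#include <cstdio>

int BroadcastRootForSend(int my_rank, int nranks) {
  assert(nranks == 2);  // both kernels enforce nranks == 2
  return my_rank;       // sender is the broadcast root
}

int BroadcastRootForRecv(int peer_rank, int nranks) {
  assert(nranks == 2);
  return peer_rank;     // receiver roots the broadcast at the sender
}

int main() {
  // Rank 0 sends to rank 1: both resolve the same broadcast root (0).
  std::printf("sender root=%d, receiver root=%d\n",
              BroadcastRootForSend(/*my_rank=*/0, 2),
              BroadcastRootForRecv(/*peer_rank=*/0, 2));
}
```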
...@@ -87,12 +87,7 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker { ...@@ -87,12 +87,7 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0); AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.") AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
.SetDefault(5); .SetDefault(5);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.") AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
.SetDefault(std::vector<int>()); .SetDefault(std::vector<int>());
AddAttr<bool>( AddAttr<bool>(
......
...@@ -14,9 +14,6 @@ limitations under the License. */ ...@@ -14,9 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/recv_v2_op.h" #include "paddle/fluid/operators/collective/recv_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
#include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/include/tensor.h"
...@@ -27,59 +24,8 @@ template <typename T> ...@@ -27,59 +24,8 @@ template <typename T>
class CRecvOpASCENDKernel : public framework::OpKernel<T> { class CRecvOpASCENDKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(out->dims(), ctx.GetPlace());
void* ptr = reinterpret_cast<void*>(const_cast<T*>(out->data<T>()));
int numel = out->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(out->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto map = distributed::ProcessGroupMapFromGid::getInstance();
if (map->has(ring_id)) {
// Use ProcessGroup
distributed::ProcessGroup* pg = map->get(ring_id);
std::vector<phi::DenseTensor> out_tensor;
out_tensor.emplace_back(*out);
auto task = pg->Recv(out_tensor, 0);
return;
}
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int peer = ctx.Attr<int>("peer");
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = peer;
VLOG(3) << "begin hccl recv, parameter is: "
<< "ring_id:" << ring_id << ", nranks:" << nranks
<< ", peer:" << peer << ", numel:" << numel << ", ptr:" << ptr
<< ", dtype:" << dtype << ", root:" << root
<< ", comm: " << comm->comm() << ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU.")); "PaddlePaddle should compile with NPU."));
#endif
} }
}; };
......
...@@ -31,10 +31,6 @@ limitations under the License. */ ...@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework; namespace f = paddle::framework;
namespace p = paddle::platform; namespace p = paddle::platform;
......
...@@ -61,12 +61,7 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker { ...@@ -61,12 +61,7 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.") AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0); .SetDefault(0);
AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0); AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<bool>( AddAttr<bool>(
"use_calc_stream", "use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.") "(bool default false) eject CUDA operations to calculation stream.")
......
...@@ -14,9 +14,6 @@ limitations under the License. */ ...@@ -14,9 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/send_v2_op.h" #include "paddle/fluid/operators/collective/send_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
#include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/include/tensor.h"
...@@ -27,56 +24,8 @@ template <typename T> ...@@ -27,56 +24,8 @@ template <typename T>
class CSendOpASCENDKernel : public framework::OpKernel<T> { class CSendOpASCENDKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<phi::DenseTensor>("X");
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
int numel = x->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto map = distributed::ProcessGroupMapFromGid::getInstance();
if (map->has(ring_id)) {
// Use ProcessGroup
distributed::ProcessGroup* pg = map->get(ring_id);
std::vector<phi::DenseTensor> in_tensor;
in_tensor.push_back(*x);
auto task = pg->Send(in_tensor, 1);
return;
}
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int rank = comm->rank();
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = rank;
VLOG(3) << "begin hccl send, parameter is: "
<< "root " << root << ", comm: " << comm->comm()
<< ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU.")); "PaddlePaddle should compile with NPU."));
#endif
} }
}; };
......
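The removed recv_v2 and send_v2 NPU kernels first consult `ProcessGroupMapFromGid`; if a process group is registered for the ring id they wrap the tensor in a vector and call `pg->Recv` / `pg->Send`, and only otherwise fall back to a raw HCCL broadcast. The sketch below mirrors that dispatch pattern with stand-in types (`Tensor`, `ProcessGroup`, and `Registry` are placeholders, not Paddle's classes):

```cpp
// Simplified sketch of the "prefer a registered ProcessGroup, else fall back
// to the raw communicator" dispatch in the removed send_v2/recv_v2 kernels.
#include <cstdio>
#include <map>
#include <vector>

struct Tensor { std::vector<float> data; };

struct ProcessGroup {
  void Send(const std::vector<Tensor>& tensors, int peer) {
    std::printf("ProcessGroup::Send %zu tensor(s) to peer %d\n",
                tensors.size(), peer);
  }
};

// Registry keyed by ring_id, analogous to ProcessGroupMapFromGid.
std::map<int, ProcessGroup*>& Registry() {
  static std::map<int, ProcessGroup*> m;
  return m;
}

void SendV2(const Tensor& x, int ring_id, int peer) {
  auto it = Registry().find(ring_id);
  if (it != Registry().end()) {
    it->second->Send({x}, peer);  // new-style collective path
    return;
  }
  std::printf("fallback: raw communicator broadcast on ring %d\n", ring_id);
}

int main() {
  ProcessGroup pg;
  Registry()[0] = &pg;
  SendV2(Tensor{{1.f, 2.f}}, /*ring_id=*/0, /*peer=*/1);
  SendV2(Tensor{{3.f}}, /*ring_id=*/7, /*peer=*/1);  // no group registered
}
```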
...@@ -31,10 +31,6 @@ limitations under the License. */ ...@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework; namespace f = paddle::framework;
namespace p = paddle::platform; namespace p = paddle::platform;
......
...@@ -84,12 +84,6 @@ class ConditionalOp : public framework::OperatorBase { ...@@ -84,12 +84,6 @@ class ConditionalOp : public framework::OperatorBase {
res = cpu_tensor.data<bool>()[0]; res = cpu_tensor.data<bool>()[0];
#endif #endif
} else if (platform::is_npu_place(ips[0]->place())) { } else if (platform::is_npu_place(ips[0]->place())) {
#ifdef PADDLE_WITH_ASCEND_CL
phi::DenseTensor cpu_tensor;
framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
res = cpu_tensor.data<bool>()[0];
#endif
} else if (platform::is_xpu_place(ips[0]->place())) { } else if (platform::is_xpu_place(ips[0]->place())) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
phi::DenseTensor cpu_tensor; phi::DenseTensor cpu_tensor;
......
...@@ -228,9 +228,8 @@ bool GetCondData(const phi::DenseTensor &cond) { ...@@ -228,9 +228,8 @@ bool GetCondData(const phi::DenseTensor &cond) {
// platform::is_npu_place(cond.place()) or // platform::is_npu_place(cond.place()) or
// platform::is_xpu_place(cond.place()) is true // platform::is_xpu_place(cond.place()) is true
std::unique_ptr<phi::DenseTensor> cpu_cond{new phi::DenseTensor()}; std::unique_ptr<phi::DenseTensor> cpu_cond{new phi::DenseTensor()};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
defined(PADDLE_WITH_CUSTOM_DEVICE)
framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
......
...@@ -16,8 +16,7 @@ limitations under the License. */ ...@@ -16,8 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif #endif
...@@ -182,84 +181,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> { ...@@ -182,84 +181,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> {
}; };
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
template <typename T>
class ConcatFunctor<platform::NPUDeviceContext, T> {
public:
void operator()(const platform::NPUDeviceContext& context,
const std::vector<phi::DenseTensor>& input,
int axis,
phi::DenseTensor* output) {
int dev_id = context.GetPlace().GetDeviceId();
platform::NPUDeviceGuard guard(dev_id);
std::vector<std::string> names;
for (size_t i = 0; i < input.size(); ++i) {
names.push_back("x" + std::to_string(i));
}
NpuOpRunner runner{
"ConcatD",
{input},
{*output},
{{"concat_dim", axis}, {"N", static_cast<int>(input.size())}}};
runner.AddInputNames(names);
runner.Run(context.stream());
}
};
template <typename T>
class SplitFunctor<platform::NPUDeviceContext, T> {
public:
void operator()(const platform::NPUDeviceContext& context,
const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& ref_inputs,
const int axis,
std::vector<phi::DenseTensor*>* outputs) {
if (input.numel() == 0) {
return;
}
size_t num = outputs->size();
int input_rows = 1;
auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i];
}
int input_cols = 0;
std::vector<int64_t> output_cols(outputs->size());
for (size_t i = 0; i < num; ++i) {
int t_cols = ref_inputs[i]->numel() / input_rows;
input_cols += t_cols;
output_cols[i] = t_cols;
}
auto npu_place = context.GetPlace();
// computation
for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0;
for (size_t j = 0; j < num; ++j) {
int col_len = output_cols[j];
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len;
memory::Copy(npu_place,
dst_ptr,
npu_place,
src_ptr + col_idx,
sizeof(T) * col_len,
context.stream());
}
col_idx += col_len;
}
}
}
};
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
template <typename T> template <typename T>
class ConcatFunctor<platform::MLUDeviceContext, T> { class ConcatFunctor<platform::MLUDeviceContext, T> {
...@@ -369,14 +290,6 @@ DEFINE_XPU_FUNCTOR(float) ...@@ -369,14 +290,6 @@ DEFINE_XPU_FUNCTOR(float)
DEFINE_XPU_FUNCTOR(platform::float16) DEFINE_XPU_FUNCTOR(platform::float16)
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#define DEFINE_NPU_FUNCTOR(type) \
template class ConcatFunctor<platform::NPUDeviceContext, type>; \
template class SplitFunctor<platform::NPUDeviceContext, type>;
FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR)
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#define DEFINE_MLU_FUNCTOR(type) \ #define DEFINE_MLU_FUNCTOR(type) \
template class ConcatFunctor<platform::MLUDeviceContext, type>; \ template class ConcatFunctor<platform::MLUDeviceContext, type>; \
......
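The deleted NPU `SplitFunctor` views the input as an `input_rows x input_cols` matrix (rows are the product of the dimensions before the split axis) and, for every row, copies one contiguous block of columns into each output. A CPU-only sketch of that bookkeeping, under the assumption that all tensors are dense float buffers:

```cpp
// CPU sketch of the row/column bookkeeping in the removed NPU SplitFunctor.
#include <cstdio>
#include <cstring>
#include <vector>

void SplitRows(const std::vector<float>& input, int input_rows,
               const std::vector<int>& output_cols,
               std::vector<std::vector<float>>* outputs) {
  int input_cols = 0;
  for (int c : output_cols) input_cols += c;
  outputs->assign(output_cols.size(), {});
  for (size_t j = 0; j < output_cols.size(); ++j) {
    (*outputs)[j].resize(static_cast<size_t>(input_rows) * output_cols[j]);
  }
  for (int k = 0; k < input_rows; ++k) {
    const float* src = input.data() + k * input_cols;
    int col_idx = 0;
    for (size_t j = 0; j < output_cols.size(); ++j) {
      const int col_len = output_cols[j];
      // Copy this output's block of columns for row k.
      std::memcpy((*outputs)[j].data() + k * col_len, src + col_idx,
                  sizeof(float) * col_len);
      col_idx += col_len;
    }
  }
}

int main() {
  // 2 rows x 3 cols, split into column widths {1, 2}.
  std::vector<float> in = {1, 2, 3, 4, 5, 6};
  std::vector<std::vector<float>> outs;
  SplitRows(in, /*input_rows=*/2, /*output_cols=*/{1, 2}, &outs);
  std::printf("%zu outputs, first = {%.0f, %.0f}\n", outs.size(), outs[0][0],
              outs[0][1]);  // first output = {1, 4}
}
```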
...@@ -123,34 +123,6 @@ REGISTER_OPERATOR( ...@@ -123,34 +123,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>, paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
MemcpyH2DInferShapeFunctor); MemcpyH2DInferShapeFunctor);
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_h2d,
float,
ops::MemcpyH2DKernel,
double,
ops::MemcpyH2DKernel,
int8_t,
ops::MemcpyH2DKernel,
uint8_t,
ops::MemcpyH2DKernel,
int,
ops::MemcpyH2DKernel,
int64_t,
ops::MemcpyH2DKernel,
bool,
ops::MemcpyH2DKernel,
paddle::platform::bfloat16,
ops::MemcpyH2DKernel,
paddle::platform::complex<float>,
ops::MemcpyH2DKernel,
paddle::platform::complex<double>,
ops::MemcpyH2DKernel,
plat::float16,
ops::MemcpyH2DKernel,
int16_t,
ops::MemcpyH2DKernel);
#endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_h2d, REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_h2d,
float, float,
......
...@@ -145,19 +145,3 @@ REGISTER_OPERATOR( ...@@ -145,19 +145,3 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>, paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
MemcpyInferShapeFunctor); MemcpyInferShapeFunctor);
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy,
float,
ops::MemcpyKernel,
double,
ops::MemcpyKernel,
int,
ops::MemcpyKernel,
int64_t,
ops::MemcpyKernel,
bool,
ops::MemcpyKernel,
plat::float16,
ops::MemcpyKernel);
#endif
...@@ -61,14 +61,7 @@ class MemcpyFunctor { ...@@ -61,14 +61,7 @@ class MemcpyFunctor {
lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor);
} else if (dst_place_type_ == DeviceType::CPU) { } else if (dst_place_type_ == DeviceType::CPU) {
framework::TensorCopySync(lod_tensor, platform::CPUPlace(), &out_tensor); framework::TensorCopySync(lod_tensor, platform::CPUPlace(), &out_tensor);
#ifdef PADDLE_WITH_ASCEND_CL
} else if (dst_place_type_ == DeviceType::NPU) { /* npu_pin->npu */
framework::TensorCopy(
lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor);
} else if (dst_place_type_ == DeviceType::NPU_PINNED) { /* npu->npu_pin */
framework::TensorCopy(
lod_tensor, platform::NPUPinnedPlace(), dev_ctx_, &out_tensor);
#endif
#ifdef PADDLE_WTIH_CUSTOM_DEVICE #ifdef PADDLE_WTIH_CUSTOM_DEVICE
} else if (dst_place_type_ == DeviceType::CUSTOM_DEVICE) { } else if (dst_place_type_ == DeviceType::CUSTOM_DEVICE) {
framework::TensorCopy( framework::TensorCopy(
......
...@@ -63,21 +63,6 @@ BufferedReader::BufferedReader( ...@@ -63,21 +63,6 @@ BufferedReader::BufferedReader(
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place_)) {
int dev_idx = place_.device;
compute_stream_ =
((platform::NPUDeviceContext *)(platform::DeviceContextPool::Instance()
.Get(place_)))
->stream();
events_.resize(buffer_size);
for (auto &event : events_) {
event = platform::NpuEventResourcePool::Instance().New(dev_idx);
}
stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx);
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place_)) { if (platform::is_mlu_place(place_)) {
int dev_idx = place_.device; int dev_idx = place_.device;
...@@ -275,56 +260,6 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -275,56 +260,6 @@ void BufferedReader::ReadAsync(size_t i) {
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place_)) {
TensorVec &npu = npu_buffer_[i];
if (npu.empty()) {
npu.resize(cpu.size());
} else {
PADDLE_ENFORCE_EQ(
npu.size(),
cpu.size(),
platform::errors::InvalidArgument(
"Input tensor number on NPU and CPU devices are not matched. "
"The number on NPU is %d, on CPU is %d",
npu.size(),
cpu.size()));
}
std::vector<void *> npu_ptrs;
npu_ptrs.reserve(cpu.size());
for (size_t i = 0; i < cpu.size(); ++i) {
npu[i].Resize(cpu[i].dims());
npu[i].set_layout(cpu[i].layout());
npu_ptrs.emplace_back(npu[i].mutable_data(place_, cpu[i].type()));
}
platform::SetNPUDeviceId(place_.device);
platform::NPUEventRecord(events_[i].get(), compute_stream_);
platform::NPUStreamWaitEvent(stream_.get(), events_[i].get());
platform::RecordEvent record_event("BufferedReader:MemoryCopy",
platform::TracerEventType::UserDefined,
1);
for (size_t i = 0; i < cpu.size(); ++i) {
auto cpu_place = cpu[i].place();
auto cpu_ptr = cpu[i].data();
auto npu_ptr = npu_ptrs[i];
auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype());
if ((platform::is_npu_place(cpu_place))) {
memory::Copy(
place_, npu_ptr, cpu_place, cpu_ptr, size, stream_.get());
} else {
memory::Copy(
place_, npu_ptr, cpu_place, cpu_ptr, size, stream_.get());
platform::NPUStreamSync(stream_.get());
}
npu[i].set_lod(cpu[i].lod());
}
platform::NPUStreamSync(stream_.get());
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place_)) { if (platform::is_mlu_place(place_)) {
TensorVec &mlu = mlu_buffer_[i]; TensorVec &mlu = mlu_buffer_[i];
......
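The removed BufferedReader NPU branch resizes the device-side buffer vector to match the incoming CPU batch, mirrors each tensor's shape, layout and LoD, and runs the copies on a dedicated copy stream that first waits on an event recorded on the compute stream, syncing at the end. The sketch below covers only the buffer-mirroring part on the CPU; the stream/event sequencing is omitted and the types are stand-ins, not Paddle's.

```cpp
// CPU-only sketch of the staging step in the removed BufferedReader NPU path.
#include <cstdio>
#include <vector>

struct HostTensor {
  std::vector<int64_t> dims;
  std::vector<float> data;
};

void StageBatch(const std::vector<HostTensor>& cpu,
                std::vector<HostTensor>* device_like) {
  if (device_like->empty()) {
    device_like->resize(cpu.size());  // first use: one slot per input tensor
  }
  // In the removed code a count mismatch raised InvalidArgument.
  if (device_like->size() != cpu.size()) {
    std::fprintf(stderr, "buffer count mismatch: %zu vs %zu\n",
                 device_like->size(), cpu.size());
    return;
  }
  for (size_t i = 0; i < cpu.size(); ++i) {
    (*device_like)[i].dims = cpu[i].dims;  // npu[i].Resize(cpu[i].dims())
    (*device_like)[i].data = cpu[i].data;  // memory::Copy(...) on copy stream
  }
}

int main() {
  std::vector<HostTensor> cpu = {{{2, 2}, {1, 2, 3, 4}}};
  std::vector<HostTensor> staged;
  StageBatch(cpu, &staged);
  std::printf("staged %zu tensor(s), first has %zu elements\n", staged.size(),
              staged[0].data.size());
}
```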
...@@ -25,8 +25,7 @@ ...@@ -25,8 +25,7 @@
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h"
#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" #include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"
...@@ -93,12 +92,6 @@ class BufferedReader : public framework::DecoratedReader { ...@@ -93,12 +92,6 @@ class BufferedReader : public framework::DecoratedReader {
std::vector<std::shared_ptr<platform::CudaEventObject>> events_; std::vector<std::shared_ptr<platform::CudaEventObject>> events_;
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
aclrtStream compute_stream_;
std::shared_ptr<platform::NpuStreamObject> stream_;
std::vector<std::shared_ptr<platform::NpuEventObject>> events_;
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
mluStream compute_stream_; mluStream compute_stream_;
std::shared_ptr<platform::MluStreamObject> stream_; std::shared_ptr<platform::MluStreamObject> stream_;
......
...@@ -11,19 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,19 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/operators/run_program_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
/* see [Why use single type kernel] */
REGISTER_OP_NPU_KERNEL(
run_program,
ops::RunProgramOpKernel<paddle::platform::NPUDeviceContext, float>);
REGISTER_OP_NPU_KERNEL(
run_program_grad,
ops::RunProgramGradOpKernel<paddle::platform::NPUDeviceContext, float>);
#endif
...@@ -11,107 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,107 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ScatterNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* index = ctx.Input<phi::DenseTensor>("Ids");
auto* updates = ctx.Input<phi::DenseTensor>("Updates");
bool overwrite = ctx.Attr<bool>("overwrite");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
phi::DenseTensor tmp_tensor(index->type());
const auto index_dims = index->dims();
if (index_dims.size() == 1) {
tmp_tensor.ShareDataWith(*index);
std::vector<int64_t> new_dim = {index_dims[0], 1};
tmp_tensor.Resize(phi::make_ddim(new_dim));
index = &tmp_tensor;
}
const auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto op_func_update = [](const std::vector<phi::DenseTensor>& inputs,
const std::vector<phi::DenseTensor>& outputs,
const NPUAttributeMap& attrs,
const platform::NPUDeviceContext& dev_ctx) {
const auto& runner =
NpuOpRunner("TensorScatterUpdate", inputs, outputs, attrs);
runner.Run(dev_ctx.stream());
};
auto op_func_add = [](const std::vector<phi::DenseTensor>& inputs,
const std::vector<phi::DenseTensor>& outputs,
const NPUAttributeMap& attrs,
const platform::NPUDeviceContext& dev_ctx) {
const auto& runner =
NpuOpRunner("TensorScatterAdd", inputs, outputs, attrs);
runner.Run(dev_ctx.stream());
};
if (overwrite) {
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
NpuOpRunner::TypeAdapter({*x, *index, *updates},
{*out},
{},
dev_ctx,
op_func_update,
{framework::proto::VarType::INT32,
framework::proto::VarType::INT32,
framework::proto::VarType::INT32},
{framework::proto::VarType::INT32});
} else {
const auto& runner_update = NpuOpRunner(
"TensorScatterUpdate", {*x, *index, *updates}, {*out}, {});
runner_update.Run(dev_ctx.stream());
}
} else {
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
NpuOpRunner::TypeAdapter({*x, *index, *updates},
{*out},
{},
dev_ctx,
op_func_add,
{framework::proto::VarType::INT32,
framework::proto::VarType::INT32,
framework::proto::VarType::INT32},
{framework::proto::VarType::INT32});
} else {
const auto& runner_add =
NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {});
runner_add.Run(dev_ctx.stream());
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
scatter,
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
#endif
...@@ -41,7 +41,7 @@ class SoftmaxWithCrossEntropyOpMaker ...@@ -41,7 +41,7 @@ class SoftmaxWithCrossEntropyOpMaker
"The outputs value of softmax activation by given the input batch, " "The outputs value of softmax activation by given the input batch, "
"which will be used in backward calculation.") "which will be used in backward calculation.")
.AsIntermediate(); .AsIntermediate();
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) #if defined(PADDLE_WITH_MLU)
AddOutput( AddOutput(
"Backprop", "Backprop",
"(Tensor, default: Tensor<float>), A tensor in same shape with " "(Tensor, default: Tensor<float>), A tensor in same shape with "
...@@ -135,7 +135,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { ...@@ -135,7 +135,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
true, true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Output(Softmax) should be not null.")); "Output(Softmax) should be not null."));
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) #if defined(PADDLE_WITH_MLU)
PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"), PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"),
true, true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
...@@ -206,10 +206,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { ...@@ -206,10 +206,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
} }
ctx->SetOutputDim("Softmax", logits_dims); ctx->SetOutputDim("Softmax", logits_dims);
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
ctx->SetOutputDim("Backprop", logits_dims);
ctx->ShareLoD("Logits", /*->*/ "Backprop");
#endif
logits_dims[axis] = 1; logits_dims[axis] = 1;
ctx->SetOutputDim("Loss", logits_dims); ctx->SetOutputDim("Loss", logits_dims);
...@@ -238,7 +235,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { ...@@ -238,7 +235,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
true, true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Input(Softmax) should be not null.")); "Input(Softmax) should be not null."));
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) #if defined(PADDLE_WITH_MLU)
PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"), PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"),
true, true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
...@@ -327,7 +324,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -327,7 +324,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker<T> {
grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetType("softmax_with_cross_entropy_grad");
grad_op->SetInput("Label", this->Input("Label")); grad_op->SetInput("Label", this->Input("Label"));
grad_op->SetInput("Softmax", this->Output("Softmax")); grad_op->SetInput("Softmax", this->Output("Softmax"));
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) #if defined(PADDLE_WITH_MLU)
grad_op->SetInput("Backprop", this->Output("Backprop")); grad_op->SetInput("Backprop", this->Output("Backprop"));
#endif #endif
grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
...@@ -359,7 +356,7 @@ REGISTER_OPERATOR(softmax_with_cross_entropy_grad, ...@@ -359,7 +356,7 @@ REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradInplaceInferer); ops::SoftmaxWithCrossEntropyGradInplaceInferer);
REGISTER_OP_VERSION(softmax_with_cross_entropy) REGISTER_OP_VERSION(softmax_with_cross_entropy)
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) #if defined(PADDLE_WITH_MLU)
.AddCheckpoint( .AddCheckpoint(
R"ROC( R"ROC(
Add a new attribute [use_softmax] )ROC", Add a new attribute [use_softmax] )ROC",
......
...@@ -127,11 +127,6 @@ void TensorFormatter::FormatData(const phi::DenseTensor& print_tensor, ...@@ -127,11 +127,6 @@ void TensorFormatter::FormatData(const phi::DenseTensor& print_tensor,
} else { } else {
platform::CPUPlace cpu_place; platform::CPUPlace cpu_place;
paddle::framework::TensorCopy(print_tensor, cpu_place, &cpu_tensor); paddle::framework::TensorCopy(print_tensor, cpu_place, &cpu_tensor);
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(print_tensor.place())) {
platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait();
}
#endif
data = cpu_tensor.data<T>(); data = cpu_tensor.data<T>();
} }
......
...@@ -11,50 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,50 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include "paddle/fluid/operators/unsqueeze_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
unsqueeze,
ops::UnsqueezeKernel<plat::NPUDeviceContext, float>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, double>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, plat::float16>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, bool>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
unsqueeze2,
ops::UnsqueezeKernel<plat::NPUDeviceContext, float>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, double>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, plat::float16>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, bool>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
unsqueeze_grad,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, float>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, double>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, plat::float16>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, bool>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int8_t>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
unsqueeze2_grad,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, float>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, double>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, plat::float16>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, bool>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int64_t>);
#endif
...@@ -92,7 +92,7 @@ inline T GetValue(const phi::DenseTensor* x) { ...@@ -92,7 +92,7 @@ inline T GetValue(const phi::DenseTensor* x) {
if (!platform::is_cpu_place(x->place())) { if (!platform::is_cpu_place(x->place())) {
phi::DenseTensor cpu_x; phi::DenseTensor cpu_x;
framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x); framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) #if defined(PADDLE_WITH_MLU)
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx = pool.Get(x->place()); const platform::DeviceContext* dev_ctx = pool.Get(x->place());
dev_ctx->Wait(); dev_ctx->Wait();
......
...@@ -147,118 +147,6 @@ class NCCLCommContext { ...@@ -147,118 +147,6 @@ class NCCLCommContext {
}; };
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
// In order to apply hierarchical communication with HCCL, we need
// a communication ring contains HCCL communicators associated to a global
// HCCLUniqueId. E.g. for a hierarchical case,
//
// 11 - 12 21 - 22
// | | | |
// 13 - 14 - 23 - 24
// | |
// 31 - 32 - 41 - 42
// | | | |
// 33 - 34 43 - 44
//
// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
// (31,32,33,34), (41,42,43,44) as bottoms respectively.
//
// We could also use a single communication ring for the flatten case
//
// The HCCLComm instance is created and reserved in the HCCLCommContext
// singleton with a global user specified group id.
class NPUDeviceContext;
#define ENV_RANK_TABLE_FILE "RANK_TABLE_FILE"
#define ENV_RANK_ID "PADDLE_TRAINER_ID"
class HCCLComm {
public:
virtual int ring_id() const = 0;
virtual int nranks() const = 0;
virtual int rank() const = 0;
virtual int device_id() const = 0;
virtual HcclComm comm() const = 0;
virtual aclrtStream stream() const = 0;
virtual NPUDeviceContext* dev_context() const = 0;
virtual ~HCCLComm() = default;
};
// A singleton HCCL communicator context reserves communication ring ids
class HCCLCommContext {
public:
static HCCLCommContext& Instance() {
static HCCLCommContext comm_ctx;
return comm_ctx;
}
HCCLComm* CreateHCCLComm(
HcclRootInfo* hccl_id, int nranks, int rank, int dev_id, int ring_id);
// a later comm with the same dev_id and the same ring_id
// will override the former
HCCLComm* AssignHCCLComm(
HcclComm comm, int nranks, int rank, int dev_id, int ring_id);
// retrieve a communicator by the ring id in multiprocessing mode
HCCLComm* Get(int ring_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id),
0,
platform::errors::InvalidArgument(
"Communicator in ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(),
1,
platform::errors::InvalidArgument(
"One device id should be specified to retrieve from "
"multiple communicators."));
return comm_map_.at(ring_id).begin()->second.get();
}
// retrieve a communicator by the ring id and the device id
HCCLComm* Get(int ring_id, int dev_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id),
0,
platform::errors::InvalidArgument(
"Communicator of ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_GT(
comm_map_.at(ring_id).count(dev_id),
0,
platform::errors::InvalidArgument(
"Communicator at device id %d has not been initialized in ring %d.",
dev_id,
ring_id));
return comm_map_.at(ring_id).at(dev_id).get();
}
// retrieve a communicator by the ring id and place
HCCLComm* Get(int ring_id, Place place) const {
return Get(ring_id, place.device);
}
private:
// Init global hcom
HCCLCommContext() {}
// we may use group feature in the future
// HCCLCommContext() { InitHcomWorldGroup(); }
HcclComm comm_;
public:
~HCCLCommContext() {}
std::once_flag once_flag_;
std::mutex comm_map_mutex_;
// ring id to dev-HCCLComm
std::map<int, std::map<int, std::unique_ptr<HCCLComm>>> comm_map_;
// void InitHcomWorldGroup();
void ReleaseHCCLComms();
DISABLE_COPY_AND_ASSIGN(HCCLCommContext);
};
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
// In order to apply hierarchical communication with BKCL, we need // In order to apply hierarchical communication with BKCL, we need
// a communication ring contains BKCL communicators associated to a global // a communication ring contains BKCL communicators associated to a global
......
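The HCCLCommContext removed above is a singleton mapping `ring_id -> device_id -> communicator`, where `Get(ring_id)` assumes exactly one device per ring (the multi-process case) and a later `AssignHCCLComm` with the same keys overrides the earlier entry. A minimal sketch of that registry shape, with `Comm` and `CommRegistry` as placeholder names rather than Paddle API:

```cpp
// Sketch of the two-level (ring_id -> device_id -> comm) registry kept by the
// removed HCCLCommContext.
#include <cassert>
#include <cstdio>
#include <map>
#include <memory>

struct Comm {
  int ring_id, rank, nranks, device_id;
};

class CommRegistry {
 public:
  static CommRegistry& Instance() {
    static CommRegistry inst;
    return inst;
  }

  void Assign(int ring_id, int dev_id, std::unique_ptr<Comm> comm) {
    comm_map_[ring_id][dev_id] = std::move(comm);  // later assign overrides
  }

  // Multi-process mode: each process holds one device per ring.
  Comm* Get(int ring_id) const {
    auto it = comm_map_.find(ring_id);
    assert(it != comm_map_.end() && it->second.size() == 1);
    return it->second.begin()->second.get();
  }

  Comm* Get(int ring_id, int dev_id) const {
    return comm_map_.at(ring_id).at(dev_id).get();
  }

 private:
  CommRegistry() = default;
  std::map<int, std::map<int, std::unique_ptr<Comm>>> comm_map_;
};

int main() {
  CommRegistry::Instance().Assign(
      0, 3, std::unique_ptr<Comm>(new Comm{0, 1, 8, 3}));
  std::printf("ring 0 rank=%d on device %d\n",
              CommRegistry::Instance().Get(0)->rank,
              CommRegistry::Instance().Get(0, 3)->device_id);
}
```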
...@@ -266,51 +266,5 @@ IPUDeviceContext::~IPUDeviceContext() {} ...@@ -266,51 +266,5 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
// NOTE(zhiqiu): Usually, no need to create context explicitly,
// ACL creates a default context which contains 1 default stream
// and 1 sync stream after aclrtSetDevice.
platform::GetCurrentNPUContext(&context_);
stream_.reset(new stream::NPUStream(place));
}
NPUDeviceContext::~NPUDeviceContext() {
// NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}
void NPUDeviceContext::Wait() const {
platform::RecordEvent record_event(
"NPUDeviceContext/wait", platform::TracerEventType::UserDefined, 2);
VLOG(4) << "NPU context(" << this << ") Wait";
stream_->Wait();
}
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
const Place& NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext NPUDeviceContext::context() const { return context_; }
NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
}
NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
: place_(place) {
eigen_device_.reset(new Eigen::DefaultDevice());
}
Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -34,10 +34,6 @@ limitations under the License. */ ...@@ -34,10 +34,6 @@ limitations under the License. */
#include "xpu/bkcl.h" #include "xpu/bkcl.h"
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
#if defined(PADDLE_WITH_CNCL) #if defined(PADDLE_WITH_CNCL)
#include <cncl.h> #include <cncl.h>
#endif #endif
...@@ -334,11 +330,7 @@ static int ConnectAddr(const std::string& ep, const CommHead head) { ...@@ -334,11 +330,7 @@ static int ConnectAddr(const std::string& ep, const CommHead head) {
} }
// TODO(WANGXI): maybe need to unify this hard code // TODO(WANGXI): maybe need to unify this hard code
#ifdef PADDLE_WITH_ASCEND_CL
#define MAX_COMMUNIQUEID_LEN 4108
#else
#define MAX_COMMUNIQUEID_LEN 1024 #define MAX_COMMUNIQUEID_LEN 1024
#endif
template <typename CommUniqueId> template <typename CommUniqueId>
static void RecvCommID(int conn, CommUniqueId* nccl_id) { static void RecvCommID(int conn, CommUniqueId* nccl_id) {
...@@ -456,9 +448,7 @@ INSTANT_TEMPLATE(ncclUniqueId) ...@@ -456,9 +448,7 @@ INSTANT_TEMPLATE(ncclUniqueId)
#ifdef PADDLE_WITH_XPU_BKCL #ifdef PADDLE_WITH_XPU_BKCL
INSTANT_TEMPLATE(BKCLUniqueId) INSTANT_TEMPLATE(BKCLUniqueId)
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
INSTANT_TEMPLATE(HcclRootInfo)
#endif
#ifdef PADDLE_WITH_CNCL #ifdef PADDLE_WITH_CNCL
INSTANT_TEMPLATE(cnclCliqueId) INSTANT_TEMPLATE(cnclCliqueId)
#endif #endif
......
...@@ -228,9 +228,7 @@ void InitDevices(const std::vector<int> devices) { ...@@ -228,9 +228,7 @@ void InitDevices(const std::vector<int> devices) {
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
places.emplace_back(platform::IPUPlace(devices[i])); places.emplace_back(platform::IPUPlace(devices[i]));
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
places.emplace_back(platform::NPUPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
places.emplace_back(platform::MLUPlace(devices[i])); places.emplace_back(platform::MLUPlace(devices[i]));
#endif #endif
......
...@@ -19,8 +19,6 @@ limitations under the License. */ ...@@ -19,8 +19,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
// //
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
namespace paddle { namespace paddle {
...@@ -95,24 +93,14 @@ typename Visitor::result_type VisitPlace(const Place &place, ...@@ -95,24 +93,14 @@ typename Visitor::result_type VisitPlace(const Place &place,
#endif #endif
} }
case phi::AllocationType::NPU: { case phi::AllocationType::NPU: {
#ifdef PADDLE_WITH_ASCEND_CL
platform::NPUPlace p(place.GetDeviceId());
return visitor(p);
#else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu_pinned")); "Paddle is not compiled with NPU. Cannot visit npu_pinned"));
return typename Visitor::result_type(); return typename Visitor::result_type();
#endif
} }
case phi::AllocationType::NPUPINNED: { case phi::AllocationType::NPUPINNED: {
#ifdef PADDLE_WITH_ASCEND_CL
platform::NPUPinnedPlace p;
return visitor(p);
#else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu_pinned")); "Paddle is not compiled with NPU. Cannot visit npu_pinned"));
return typename Visitor::result_type(); return typename Visitor::result_type();
#endif
} }
case phi::AllocationType::IPU: { case phi::AllocationType::IPU: {
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
......
...@@ -33,11 +33,8 @@ static void StreamCallbackFunc(gpuStream_t stream, ...@@ -33,11 +33,8 @@ static void StreamCallbackFunc(gpuStream_t stream,
#endif #endif
#endif #endif
#if PADDLE_WITH_ASCEND_CL
static void StreamCallbackFunc(void *user_data)
#endif
#if PADDLE_WITH_MLU #if PADDLE_WITH_MLU
static void StreamCallbackFunc(void *user_data) static void StreamCallbackFunc(void *user_data)
#endif #endif
{ {
std::unique_ptr<std::function<void()>> func( std::unique_ptr<std::function<void()>> func(
...@@ -75,12 +72,6 @@ void StreamCallbackManager<Stream>::AddCallback( ...@@ -75,12 +72,6 @@ void StreamCallbackManager<Stream>::AddCallback(
#endif #endif
#endif #endif
#if PADDLE_WITH_ASCEND_CL
VLOG(3) << "aclrtLaunchCallback at stream: " << stream_;
// TODO(zhiqiu): failed to call aclrtLaunchCallback
NPULaunchCallback(StreamCallbackFunc, func, ACL_CALLBACK_BLOCK, stream_);
#endif
#if PADDLE_WITH_MLU #if PADDLE_WITH_MLU
VLOG(3) << "MLULaunchCallback at stream: " << stream_; VLOG(3) << "MLULaunchCallback at stream: " << stream_;
cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func); cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func);
...@@ -94,9 +85,6 @@ void StreamCallbackManager<Stream>::Wait() const { ...@@ -94,9 +85,6 @@ void StreamCallbackManager<Stream>::Wait() const {
#endif #endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_)); PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUStreamSync(stream_);
#endif #endif
{ {
std::lock_guard<std::mutex> lock(mtx_); std::lock_guard<std::mutex> lock(mtx_);
...@@ -112,9 +100,7 @@ template struct StreamCallbackManager<gpuStream_t>; ...@@ -112,9 +100,7 @@ template struct StreamCallbackManager<gpuStream_t>;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
template struct StreamCallbackManager<hipStream_t>; template struct StreamCallbackManager<hipStream_t>;
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
template struct StreamCallbackManager<aclrtStream>;
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
template struct StreamCallbackManager<mluStream>; template struct StreamCallbackManager<mluStream>;
#endif #endif
......
...@@ -26,12 +26,9 @@ ...@@ -26,12 +26,9 @@
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/pybind/eager_generator.h"
#include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/pybind/pybind.h"
#include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/string/string_helper.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
#endif
#include "paddle/fluid/pybind/eager_generator.h"
// phi // phi
#include "paddle/phi/kernels/declarations.h" #include "paddle/phi/kernels/declarations.h"
...@@ -485,11 +482,6 @@ int main(int argc, char* argv[]) { ...@@ -485,11 +482,6 @@ int main(int argc, char* argv[]) {
return -1; return -1;
} }
#ifdef PADDLE_WITH_ASCEND_CL
auto ascend_ptr = paddle::framework::AscendInstance::GetInstance();
ascend_ptr->InitGEForUT();
#endif
std::vector<std::string> headers{ std::vector<std::string> headers{
"<Python.h>", "<Python.h>",
"\"paddle/fluid/platform/enforce.h\"", "\"paddle/fluid/platform/enforce.h\"",
...@@ -557,9 +549,5 @@ int main(int argc, char* argv[]) { ...@@ -557,9 +549,5 @@ int main(int argc, char* argv[]) {
out.close(); out.close();
#ifdef PADDLE_WITH_ASCEND_CL
ge::GEFinalize();
#endif
return 0; return 0;
} }
...@@ -2547,9 +2547,9 @@ void BindImperative(py::module *m_ptr) { ...@@ -2547,9 +2547,9 @@ void BindImperative(py::module *m_ptr) {
}, },
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_CNCL) defined(PADDLE_WITH_CNCL)
py::class_<imperative::ParallelContext, py::class_<imperative::ParallelContext,
std::shared_ptr<imperative::ParallelContext>>(m, std::shared_ptr<imperative::ParallelContext>>(m,
"ParallelContext"); "ParallelContext");
...@@ -2630,7 +2630,7 @@ void BindImperative(py::module *m_ptr) { ...@@ -2630,7 +2630,7 @@ void BindImperative(py::module *m_ptr) {
#endif #endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) defined(PADDLE_WITH_XPU_BKCL)
py::class_<imperative::HeterParallelContext, py::class_<imperative::HeterParallelContext,
imperative::ParallelContext, imperative::ParallelContext,
std::shared_ptr<imperative::HeterParallelContext>>( std::shared_ptr<imperative::HeterParallelContext>>(
......
...@@ -57,7 +57,7 @@ inline void CopyWithContext(const Context& ctx, ...@@ -57,7 +57,7 @@ inline void CopyWithContext(const Context& ctx,
const void* src, const void* src,
size_t num) { size_t num) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) defined(PADDLE_WITH_MLU)
memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream()); memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream());
#else #else
PADDLE_THROW( PADDLE_THROW(
......
...@@ -101,9 +101,6 @@ int main(int argc, char** argv) { ...@@ -101,9 +101,6 @@ int main(int argc, char** argv) {
int ret = RUN_ALL_TESTS(); int ret = RUN_ALL_TESTS();
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::AclInstance::Instance().Finalize();
#endif
if (env_str) free(env_str); if (env_str) free(env_str);
if (undefok_str) free(undefok_str); if (undefok_str) free(undefok_str);
return ret; return ret;
......