Unverified commit 80dd1672, authored by 张春乔, committed by GitHub

mv PADDLE_WITH_ASCEND_CL (#52535)

Parent 29c28e2f
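The change applied across every file below follows one pattern: PADDLE_WITH_ASCEND_CL is dropped from compile-time guard lists and the NPU/HCCL-only branches it guarded are deleted, leaving only the remaining backends (NCCL/RCCL, BKCL, GLOO, CNCL, MLU, ...) in the guards. A minimal standalone C++ sketch of that guard-simplification pattern (hypothetical function name, not Paddle source; only the real macro names are reused for illustration):

#include <cstdio>

// Before this commit a guard typically read:
//   #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL)
// After it, the Ascend macro is gone and only the surviving backends remain:
#if defined(PADDLE_WITH_NCCL)
void ReleaseCollectiveHandle() {
  // branch kept for a backend that is still supported
  std::puts("collective backend compiled in: release the handle");
}
#else
void ReleaseCollectiveHandle() {
  // no collective backend compiled in: nothing to do
}
#endif

int main() {
  ReleaseCollectiveHandle();
  return 0;
}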
......@@ -52,7 +52,7 @@ void MessageBus::Init(
}
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_XPU_BKCL)
// NOTE: To make brpc compatible with collective,
// we need to release the handler holding the IP address.
if (addr_ != "") {
......
......@@ -2128,12 +2128,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
// CPUKernel will be executed and a warning will be given at the same
// time.
expected_kernel_key.place_ = platform::CPUPlace();
#ifdef PADDLE_WITH_ASCEND_CL
if (SupportNPU()) {
auto& dev_ctx = ctx.device_context();
expected_kernel_key.place_ = dev_ctx.GetPlace();
}
#endif
if (platform::is_cpu_place(expected_kernel_key.place_)) {
LOG_FIRST_N(WARNING, 1)
<< "Op(" << type_
......@@ -2305,16 +2300,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
platform::is_mlu_place(expected_kernel_key.place_)) {
......
......@@ -150,15 +150,6 @@ AmpOperators::AmpOperators()
unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
unsupported_ops_gpu_bf16.end());
// NOTE: GPU/NPU/XPU/MLU are compiled separately.
#elif defined(PADDLE_WITH_ASCEND_CL)
auto unsupported_ops_npu_fp16 = std::get<2>(
OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_npu_fp16.begin(),
unsupported_ops_npu_fp16.end());
auto unsupported_ops_npu_bf16 = std::get<2>(
OpSupportedInfos("NPU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_npu_bf16.begin(),
unsupported_ops_npu_bf16.end());
#elif defined(PADDLE_WITH_XPU)
auto unsupported_ops_xpu_fp16 = std::get<2>(
OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16));
......
......@@ -34,8 +34,6 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "xpu/refactor/math.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
......@@ -270,32 +268,6 @@ void TensorAdd(const VarType& src, VarType* dst) {
#endif
}
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place)) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<platform::NPUDeviceContext*>(ctx);
if (data_type == framework::DataTypeTrait<float>::DataType()) {
dst_tensor->mutable_data<float>(place);
} else if (data_type == framework::DataTypeTrait<double>::DataType()) {
dst_tensor->mutable_data<double>(place);
} else if (data_type ==
framework::DataTypeTrait<platform::float16>::DataType()) {
dst_tensor->mutable_data<platform::float16>(place);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type),
place));
}
const auto& runner = operators::NpuOpRunner(
"Add", {*dst_tensor, src_tensor}, {*dst_tensor}, {});
runner.Run(dev_ctx->stream());
return;
}
#endif
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(place)) {
if (data_type == framework::DataTypeTrait<float>::DataType()) {
......
......@@ -12,62 +12,3 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/imperative/parallel_context.h"
namespace paddle {
namespace framework {
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace imperative {
class HCCLParallelContext : public ParallelContext {
public:
explicit HCCLParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: ParallelContext(strategy, place) {}
~HCCLParallelContext() override = default;
void BcastHCCLId(const std::vector<HcclRootInfo>& hccl_ids,
int root, // NOLINT
int server_fd);
void Init() override;
void InitWithRingID(int ring_id) override;
void AllReduceByStream(const framework::Variable& src,
framework::Variable* dst,
int ring_id,
bool use_calc_stream) override;
void Broadcast(framework::Variable* src, int ring_id) override;
paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;
void WaitCompute(int ring_id) override;
void WaitComm(int ring_id) override;
void SynchronizeCompute() override;
private:
// used for comm wait compute, compute_stream-->event-->comm_stream[ring_id]
std::vector<std::shared_ptr<platform::NpuStreamObject>> compute_events_;
// used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream
std::vector<std::shared_ptr<platform::NpuEventObject>> comm_events_;
};
} // namespace imperative
} // namespace paddle
#endif
......@@ -42,8 +42,6 @@ HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy,
: ParallelContext(strategy, platform::CUDAPlace(device_id))
#elif PADDLE_WITH_XPU_BKCL
: ParallelContext(strategy, platform::XPUPlace(device_id))
#elif PADDLE_WITH_ASCEND_CL
: ParallelContext(strategy, platform::NPUPlace(device_id))
#else
: ParallelContext(strategy, platform::CPUPlace())
#endif
......@@ -112,11 +110,6 @@ HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy,
node_parallel_ctx_ =
std::make_shared<BKCLParallelContext>(node_strategy_, node_place_);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
node_place_ = platform::NPUPlace(device_id);
node_parallel_ctx_ =
std::make_shared<HCCLParallelContext>(node_strategy_, node_place_);
#endif
}
void HeterParallelContext::Init() {
......
......@@ -24,11 +24,6 @@
#ifdef PADDLE_WITH_XPU_BKCL
#include "paddle/fluid/imperative/bkcl_context.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/imperative/hccl_context.h"
#endif
#include "paddle/fluid/imperative/gloo_context.h"
#include "paddle/fluid/imperative/parallel_context.h"
......
......@@ -458,17 +458,6 @@ PreparedOp PrepareImpl(
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
paddle::platform::is_npu_place(fluid_kernel_type.place_)) {
VLOG(3) << "missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << fluid_kernel_type
<< ", fallbacking to CPU one!";
fluid_kernel_type.place_ = platform::CPUPlace();
kernel_iter = kernels.find(fluid_kernel_type);
}
#endif
#ifdef PADDLE_WITH_IPU
if (kernel_iter == kernels.end() &&
paddle::platform::is_ipu_place(fluid_kernel_type.place_)) {
......
......@@ -31,7 +31,7 @@ namespace imperative {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_CNCL)
// div the nranks
void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
phi::DenseTensor *tensor =
......@@ -305,17 +305,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
"Please recompile or reinstall Paddle with BKCL support."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
ConcatTensorsWithType(
static_cast<const platform::NPUDeviceContext &>(context),
dense_tensors_,
&dense_contents_,
dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't concat npu grads since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_CNCL
ConcatTensorsWithType(
......@@ -365,17 +358,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
"Please recompile or reinstall Paddle with BKCL support."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
SplitTensorsWithType(
static_cast<const platform::NPUDeviceContext &>(context),
&dense_contents_,
&dense_tensors_,
dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't split npu grad since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_CNCL
SplitTensorsWithType(
......@@ -1129,9 +1115,8 @@ void Reducer::FinalizeBackward() {
if (find_unused_vars_each_step_) {
// TODO(liuyuhui): support TensorCopy/TensorFromVector/TensorToVector on XPU
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \
defined(PADDLE_WITH_CNCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_CNCL)
ProcessUnusedDenseVars();
#endif
// Initialize local used vars
......
......@@ -46,7 +46,7 @@ namespace imperative {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_CNCL)
template <typename T>
struct DivNRanksFunctor {
......
......@@ -135,15 +135,10 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
gc.reset(new framework::CPUGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
} else if (platform::is_npu_place(place)) {
#if defined(PADDLE_WITH_ASCEND_CL)
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
gc.reset(new framework::NPUUnsafeFastGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use NPU device since it's not compiled with NPU,"
"Please recompile or reinstall Paddle with NPU support."));
#endif
} else if (platform::is_ipu_place(place)) {
#if defined(PADDLE_WITH_IPU)
gc.reset(new framework::IPUGarbageCollector(place, 0));
......@@ -303,12 +298,8 @@ void Tracer::TraceOpImpl(const std::string& type,
"PaddlePaddle should compile with XPU if use XPUPlace."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
platform::SetNPUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
platform::SetMLUDeviceId(place.device);
......
......@@ -49,12 +49,6 @@ class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Scale",
"(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
"operator.");
#ifdef PADDLE_WITH_ASCEND_CL
AddInput("FloatStatus",
"(Tensor) 1-dim tensor of shape [8], allocated by "
"alloc_float_status op")
.AsDispensable();
#endif
AddOutput("Out",
"(Tensors) The scaled output tensor of "
"check_finite_and_unscale operator.")
......
......@@ -57,26 +57,7 @@ struct FillConstantVisitor {
void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
std::is_same<T, int16_t>::value)>::type
* = nullptr) const {
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(dev_ctx_.GetPlace())) {
phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(dtype_));
tensor_tmp.mutable_data<T>({1}, context_.GetPlace());
FillNpuTensorWithConstant<T>(&tensor_tmp, static_cast<T>(value_));
const auto &runner =
NpuOpRunner("FillD",
{tensor_tmp},
{*tensor_},
{{"dims", phi::vectorize(tensor_->dims())}});
auto stream =
context_.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
} else {
phi::funcs::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
}
#elif defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
if (platform::is_mlu_place(context_.GetPlace())) {
FillMLUTensorWithHostValue<T>(context_, static_cast<T>(value_), tensor_);
} else {
......@@ -235,12 +216,6 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
// Init the continuous space
size_t offset = 0;
if (context.Attr<bool>("copy_data")) {
#ifdef PADDLE_WITH_ASCEND_CL
framework::VisitDataType(
dtype,
FillConstantVisitor<DeviceContext>(
dev_ctx, fused_tensor, static_cast<float>(0.0), dtype, context));
#endif
for (size_t i = 0; i < in_var_names.size(); ++i) {
size_t len = static_cast<size_t>(in_tensors[i]->numel());
auto sub_tensor = fused_tensor->Slice(
......@@ -534,25 +509,6 @@ REGISTER_OPERATOR(coalesce_tensor,
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#if defined(PADDLE_WITH_ASCEND_CL)
REGISTER_OP_CUDA_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, double>);
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
REGISTER_OP_NPU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
ops::CoalesceTensorOpKernel<phi::CPUContext, float>,
ops::CoalesceTensorOpKernel<phi::CPUContext, plat::float16>,
ops::CoalesceTensorOpKernel<phi::CPUContext, double>);
#endif
#if defined(PADDLE_WITH_MLU)
REGISTER_OP_MLU_KERNEL(
coalesce_tensor,
......
......@@ -44,10 +44,6 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allgather result");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all gather.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -16,10 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -27,51 +23,8 @@ template <typename T>
class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
framework::DDim out_dims = in->dims();
out_dims[0] *= nranks;
out->mutable_data<T>(out_dims, place);
uint64_t send_numel = in->numel();
void *send_buff = reinterpret_cast<void *>(const_cast<T *>(in->data<T>()));
void *recv_buff = reinterpret_cast<void *>(out->data<T>());
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext *>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
VLOG(3) << "begin hccl allgather, parameter is: "
<< ", group is " << group << ", ring_id is " << ring_id
<< ", nranks is " << nranks;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllGather(send_buff,
recv_buff,
send_numel,
dtype,
comm->comm(),
reinterpret_cast<void *>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -34,10 +34,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -34,10 +34,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -24,9 +24,8 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/api/include/tensor.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) || \
defined(PADDLE_WITH_CNCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
......@@ -44,17 +43,10 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#endif
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
DECLARE_bool(hccl_check_nan);
#endif
namespace paddle {
namespace operators {
......@@ -150,177 +142,12 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
template <typename T, typename DeviceContext> \
class op_name##CPUKernel : public CAllReduceOpCPUKernel<red_type, T> {};
#if defined(PADDLE_WITH_ASCEND_CL)
// return true if found_nan or return false;
inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
aclrtStream stream,
const phi::DenseTensor* in) {
phi::DenseTensor out(in->type());
phi::DenseTensor mean(in->type());
mean.Resize({1});
mean.mutable_data<float>(dev_ctx.GetPlace());
std::vector<int> axes;
for (int i = 0; i < in->dims().size(); ++i) {
axes.push_back(i);
}
std::vector<float> vec;
try {
const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
paddle::framework::TensorToVector(mean, dev_ctx, &vec);
} catch (...) {
LOG(WARNING) << "ContainsNan catch exception";
return true;
}
VLOG(4) << "reducemeand result:" << vec[0];
if (std::isnan(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects nan";
return true;
}
if (std::isinf(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects inf";
}
return false;
}
#endif
template <ReduceType red_type, typename T>
class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
if (ctx.HasInput("Cond")) {
auto cond = ctx.Input<phi::DenseTensor>("Cond");
auto place = cond->place();
PADDLE_ENFORCE_EQ(platform::is_cpu_place(place),
true,
platform::errors::PreconditionNotMet(
"The input `cond` tensor should be on cpu place"));
PADDLE_ENFORCE_EQ(cond->numel(),
1,
platform::errors::PreconditionNotMet(
"The input `cond` should be shape [1]"));
if (!cond->data<bool>()[0]) {
VLOG(4) << "Skip all reduce Op since cond is 0";
return;
}
}
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
out->mutable_data<T>(in->dims(), ctx.GetPlace());
void* recvbuff = reinterpret_cast<void*>(out->data<T>());
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
if (ctx.Attr<bool>("use_calc_stream")) {
stream = dev_ctx->stream();
} else {
stream = comm->stream();
}
HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
switch (red_type) {
case kRedSum:
hccl_red_type = HCCL_REDUCE_SUM;
break;
case kRedMax:
hccl_red_type = HCCL_REDUCE_MAX;
break;
case kRedMin:
hccl_red_type = HCCL_REDUCE_MIN;
break;
case kRedProd:
hccl_red_type = HCCL_REDUCE_PROD;
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid reduce type: %d", red_type));
}
VLOG(3) << "hccl allreduce, parameter is: "
<< "input num: " << in->dims() << "dtype: " << dtype
<< "hccl_red_type: " << hccl_red_type << ", group is: " << group
<< ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
<< ", out_size:" << out->memory_size()
<< ", use_calc_stream:" << ctx.Attr<bool>("use_calc_stream")
<< ", stream:" << stream;
phi::DenseTensor tmp;
tmp.mutable_data<float>({8}, ctx.GetPlace());
bool found_nan = false;
auto d_type = framework::TransToProtoVarType(in->dtype());
switch (d_type) {
case framework::proto::VarType::FP16: {
break;
}
case framework::proto::VarType::FP32: {
if (FLAGS_hccl_check_nan) {
VLOG(3) << "prepare to FoundNanInf";
// NOTE: performance relating, DO NOT REMOVE!
ContainsNan(*dev_ctx, dev_ctx->stream(), in);
}
break;
}
default:
break;
}
if (found_nan) {
T inf = static_cast<T>(std::numeric_limits<float>::infinity());
VLOG(4) << "fill input data constant inf";
auto dims = in->dims();
auto mutable_in = const_cast<phi::DenseTensor*>(in);
FillNpuTensorWithConstant<T>(mutable_in, inf);
mutable_in->Resize(dims);
}
VLOG(3) << "hccl allreduce, parameter is: "
<< "input num: " << numel << "dtype: " << dtype
<< "hccl_red_type: " << hccl_red_type << ", group is: " << group
<< ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
<< ", out_size:" << out->memory_size();
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllReduce(sendbuff,
recvbuff,
numel,
dtype,
hccl_red_type,
comm->comm(),
reinterpret_cast<void*>(stream)));
out->Resize(in->dims());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......@@ -616,10 +443,6 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allreduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
......
......@@ -42,10 +42,7 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(0);
AddAttr<int>("root", "(int default 0) root id for broadcasting.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -14,10 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -25,58 +21,8 @@ template <typename T>
class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<phi::DenseTensor>("X");
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
int numel = x->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));
auto out = ctx.Output<phi::DenseTensor>("Out");
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int root = ctx.Attr<int>("root");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
VLOG(3) << "begin hccl broadcast, parameter is: "
<< "root " << root << ", group is " << group
<< ", comm: " << comm->comm() << ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received "
<< phi::product(out->dims());
dev_ctx->Wait();
if (out != x) {
framework::TensorCopy(*static_cast<const phi::DenseTensor*>(x),
place,
*platform::DeviceContextPool::Instance().Get(place),
static_cast<phi::DenseTensor*>(out));
}
dev_ctx->Wait();
out->Resize(x->dims());
out->set_lod(x->lod());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -21,11 +21,6 @@ namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
#if defined(PADDLE_WITH_ASCEND_CL)
#include "hccl/hccl.h"
#include "hccl/hccl_types.h"
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -48,52 +43,9 @@ class CCommInitOpAscend : public framework::OperatorBase {
auto var = scope.FindVar(Input("X"));
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::InvalidArgument("Input cannot be empty."));
#if defined(PADDLE_WITH_ASCEND_CL)
HcclRootInfo* hccl_id = var->GetMutable<HcclRootInfo>();
int rank_ids = Attr<int>("rank_ids");
int rank_id = Attr<int>("rank");
int rid = Attr<int>("ring_id");
int device_id = place.device;
if (Attr<int>("device_id") >= 0) {
device_id = Attr<int>("device_id");
}
platform::HCCLCommContext::Instance().CreateHCCLComm(
hccl_id, rank_ids, rank_id, device_id, rid);
// Build comm
float* buff;
int32_t size = 20;
std::vector<float> input(size, 0);
for (int32_t idx = 0; idx < size; idx++) {
input[idx] = 1.0;
}
PADDLE_ENFORCE_NPU_SUCCESS(platform::RecordedNPUMalloc(
reinterpret_cast<void**>(&buff), size * sizeof(float), device_id));
platform::NPUMemcpySync(reinterpret_cast<void*>(buff),
input.data(),
size * sizeof(float),
ACL_MEMCPY_HOST_TO_DEVICE,
size * sizeof(float));
VLOG(3) << "Build buff data successful.";
aclrtStream stream = nullptr;
auto comm = paddle::platform::HCCLCommContext::Instance().Get(rid, place);
if (rank_id == 0) {
stream = comm->stream();
} else {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
}
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
buff, size, HCCL_DATA_TYPE_FP32, 0, comm->comm(), stream));
// Synchronize stream to find hccl error in time.
platform::NPUStreamSync(stream);
VLOG(3) << "Build connection successful.";
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -27,83 +27,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
#ifdef PADDLE_WITH_ASCEND_CL
static void GenHCCLID(std::vector<HcclRootInfo>* hccl_ids) {
constexpr int timeout = 2 * 60 + 10; // 2MSL+10s
constexpr int retry_time = 1;
for (size_t i = 0; i < hccl_ids->size(); ++i) {
bool failed = true;
for (auto retry_times = 0; retry_times * retry_time < timeout;
++retry_times) {
auto err = platform::dynload::HcclGetRootInfo(&(*hccl_ids)[i]);
if (err == 0) {
failed = false;
break;
}
std::this_thread::sleep_for(std::chrono::seconds(retry_time));
LOG(WARNING) << "HcclGetRootInfo failed, err is: " << err << ", retry "
<< retry_times << " times";
}
if (failed) {
PADDLE_THROW(platform::errors::External("HcclGetRootInfo failed!"));
}
}
}
static void CopyHCCLIDToVar(const std::vector<HcclRootInfo>& hccl_ids,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
for (size_t i = 0; i < hccl_ids.size(); ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto hccl_id = var->GetMutable<HcclRootInfo>();
memcpy(hccl_id, &hccl_ids[i], sizeof(HcclRootInfo));
}
}
class CGenHCCLIdOp : public framework::OperatorBase {
public:
CGenHCCLIdOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
int rank = Attr<int>("rank");
int ring_id = Attr<int>("ring_id");
std::function<std::string(size_t)> func = [&](size_t i) -> std::string {
return Output("Out");
};
std::string endpoint = Attr<std::string>("endpoint");
int server_fd = platform::SocketServer::GetInstance(endpoint).socket();
std::vector<HcclRootInfo> hccl_ids;
hccl_ids.resize(1);
if (rank == 0) {
GenHCCLID(&hccl_ids);
std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("other_endpoints");
platform::SendBroadCastCommID(endpoint_list, &hccl_ids, ring_id);
} else {
platform::RecvBroadCastCommID(server_fd, endpoint, &hccl_ids, ring_id);
}
CopyHCCLIDToVar(hccl_ids, func, scope);
}
};
#else
class CGenHCCLIdOp : public framework::OperatorBase {
public:
CGenHCCLIdOp(const std::string& type,
......@@ -116,8 +39,6 @@ class CGenHCCLIdOp : public framework::OperatorBase {
const platform::Place& dev_place) const override {}
};
#endif
class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......
......@@ -26,7 +26,7 @@ limitations under the License. */
#include "paddle/phi/core/ddim.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
......@@ -44,9 +44,6 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#endif
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
......@@ -134,86 +131,8 @@ template <ReduceType red_type, typename T>
class CReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void* recvbuff = reinterpret_cast<void*>(out->data<T>());
int ring_id = ctx.Attr<int>("ring_id");
int root_id = ctx.Attr<int>("root_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int rank_id = comm->rank();
HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
switch (red_type) {
case kRedSum:
hccl_red_type = HCCL_REDUCE_SUM;
break;
case kRedMax:
hccl_red_type = HCCL_REDUCE_MAX;
break;
case kRedMin:
hccl_red_type = HCCL_REDUCE_MIN;
break;
case kRedProd:
hccl_red_type = HCCL_REDUCE_PROD;
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid reduce type: %d", red_type));
}
VLOG(3) << "begin hccl reduce, parameter is: "
<< "input num: " << numel << "root_id: " << root_id
<< "dtype: " << dtype << "hccl_red_type: " << hccl_red_type
<< ", group is: " << group;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllReduce(sendbuff,
recvbuff,
numel,
dtype,
hccl_red_type,
comm->comm(),
reinterpret_cast<void*>(stream)));
if (rank_id != root_id) {
auto npu_place = place;
memory::Copy(npu_place,
reinterpret_cast<void*>(out->data<T>()),
npu_place,
reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
numel * sizeof(T),
stream);
}
out->Resize(in->dims());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......@@ -433,10 +352,7 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the reduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for reduce.")
.SetDefault("tag");
#endif
AddAttr<int>("root_id", "(int default 0) root id.").SetDefault(0);
AddAttr<bool>(
"use_calc_stream",
......
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -50,10 +50,7 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("nranks",
"Total trainer count of the distributed training job")
.SetDefault(1);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for reduce scatter.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -14,10 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -25,59 +21,8 @@ template <typename T>
class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
auto out_dims = in->dims();
PADDLE_ENFORCE_EQ(out_dims[0] % nranks,
0,
platform::errors::InvalidArgument(
"The input tensor X's "
"dim[0] (%d) should be divisible by nranks(%d)",
out_dims[0],
nranks));
out_dims[0] = out_dims[0] / nranks;
out->mutable_data<T>(out_dims, place);
uint64_t recv_numel = in->numel() / nranks;
void* inputPtr = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void* outputPtr = reinterpret_cast<void*>(out->data<T>());
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
VLOG(3) << "begin hccl reduce scatter, parameter is: "
<< "recv_numel: " << recv_numel << "dtype: " << dtype
<< "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclReduceScatter(inputPtr,
outputPtr,
recv_numel,
dtype,
HCCL_REDUCE_SUM,
comm->comm(),
reinterpret_cast<void*>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -34,10 +34,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -47,17 +47,6 @@ class CSyncCalcStreamKernel : public framework::OpKernel<T> {
platform::GpuStreamSync(dev_ctx->stream());
#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_npu_place(place),
true,
platform::errors::PreconditionNotMet(
"Sync stream op can run on npu place only for now."));
auto dev_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
platform::NPUStreamSync(dev_ctx->stream());
#elif defined(PADDLE_WITH_CNCL)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_mlu_place(place),
......
......@@ -26,7 +26,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#if defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
......@@ -45,19 +45,6 @@ class CSyncCommStreamKernel : public framework::OpKernel<T> {
platform::GpuStreamSync(stream);
#elif defined(PADDLE_WITH_ASCEND_CL)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_npu_place(place),
true,
platform::errors::PreconditionNotMet(
"Sync comm stream op can run on npu place only for "
"now, but we got %s, please check the environment.",
place.DebugString()));
int ring_id = ctx.Attr<int>("ring_id");
auto stream =
platform::HCCLCommContext::Instance().Get(ring_id, place)->stream();
platform::NPUStreamSync(stream);
#elif defined(PADDLE_WITH_CNCL)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_mlu_place(place),
......
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -32,10 +32,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -30,144 +30,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
#ifdef PADDLE_WITH_ASCEND_CL
class GenHCCLIdOp : public framework::OperatorBase {
public:
GenHCCLIdOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
std::vector<std::string> trainers =
Attr<std::vector<std::string>>("trainers");
int trainer_id = Attr<int>("trainer_id");
std::string endpoint = trainers[trainer_id];
PADDLE_ENFORCE_GE(
trainer_id,
0,
platform::errors::InvalidArgument("trainer_id %d is less than 0. Its "
"valid range is [0, trainer_size)"));
PADDLE_ENFORCE_LT(
trainer_id,
static_cast<int>(trainers.size()),
platform::errors::OutOfRange("trainer_id %d is out of range. Its valid "
"range is [0, trainer_size)",
trainer_id));
int hccl_comm_num = Attr<int>("hccl_comm_num");
int use_hierarchical_allreduce = Attr<bool>("use_hierarchical_allreduce");
int inter_nranks = Attr<int>("hierarchical_allreduce_inter_nranks");
int inter_trainer_id = -1;
int exter_trainer_id = -1;
if (use_hierarchical_allreduce) {
PADDLE_ENFORCE_GT(
trainers.size(),
1,
platform::errors::PreconditionNotMet(
"The number of collective trainers %llu <= 1", trainers.size()));
PADDLE_ENFORCE_GT(
inter_nranks,
1,
platform::errors::PreconditionNotMet(
"inter_nranks %d <= 1 while in hierarchical allreduce mode",
inter_nranks));
PADDLE_ENFORCE_EQ(
trainers.size() % inter_nranks,
0,
platform::errors::PreconditionNotMet(
"The number of trainers %llu mod inter_nranks %d is not equal 0",
trainers.size(),
inter_nranks));
inter_trainer_id = trainer_id % inter_nranks;
if (trainer_id % inter_nranks == 0) {
exter_trainer_id = trainer_id / inter_nranks;
}
}
std::ostringstream ss;
for (size_t i = 0; i < trainers.size(); i++) {
ss << trainers[i] << ",";
}
VLOG(1) << "trainer_id:" << trainer_id
<< ", use_hierarchical_allreduce:" << use_hierarchical_allreduce
<< ", hccl_comm_num:" << hccl_comm_num
<< ", inter_nranks:" << inter_nranks
<< ", inter_trainer_id:" << inter_trainer_id
<< ", exter_trainer_id:" << exter_trainer_id
<< ", trainers:" << ss.str();
int server_fd = -1;
/// 1. init flat
std::function<std::string(size_t)> func = platform::GetFlatHCCLVarName;
if (trainer_id == 0) {
// server endpoints
std::vector<std::string> flat_endpoints;
flat_endpoints.insert(
flat_endpoints.begin(), trainers.begin() + 1, trainers.end());
SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope);
} else {
server_fd = CreateListenSocket(endpoint);
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
/// 2. hierarchical inter ncclid
func = platform::GetHierarchicalInterHCCLVarName;
if (inter_trainer_id == 0) {
std::ostringstream ss;
ss << endpoint;
std::vector<std::string> inter_endpoints;
for (int i = trainer_id + 1; i < trainer_id + inter_nranks &&
i < static_cast<int>(trainers.size());
i++) {
ss << ",";
inter_endpoints.push_back(trainers[i]);
ss << trainers[i];
}
VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str();
SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope);
} else if (inter_trainer_id > 0) {
VLOG(1) << "Hierarchical inter ring";
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
/// 3. hierarchical exter ncclid
func = platform::GetHierarchicalExterHCCLVarName;
if (exter_trainer_id == 0) {
std::ostringstream ss;
std::vector<std::string> exter_endpoints;
ss << endpoint;
for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) {
ss << ",";
exter_endpoints.push_back(trainers[i]);
ss << trainers[i];
}
VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str();
SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope);
} else if (exter_trainer_id > 0) {
VLOG(1) << "Hierarchical exter ring";
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
// close socket server
if (trainer_id != 0) {
CloseSocket(server_fd);
}
}
};
#else
class GenHCCLIdOp : public framework::OperatorBase {
public:
GenHCCLIdOp(const std::string& type,
......@@ -180,8 +42,6 @@ class GenHCCLIdOp : public framework::OperatorBase {
const platform::Place& dev_place) const override {}
};
#endif
class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......
......@@ -30,10 +30,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/split.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
DECLARE_int32(get_host_by_name_time);
namespace paddle {
......
......@@ -42,10 +42,7 @@ class MpAllReduceSumOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allreduced result in model parallel.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -50,10 +50,7 @@ class PartialAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allgather result");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all gather.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -24,67 +24,8 @@ template <typename T>
class CallPartialGatherOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int64_t numel = in->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
int rank = ctx.Attr<int>("rank");
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
PADDLE_ENFORCE_EQ(rank,
comm->rank(),
platform::errors::InvalidArgument(
"rank: %s should equal to %s", rank, comm->rank()));
PADDLE_ENFORCE_EQ(
(numel % nranks),
0,
platform::errors::InvalidArgument(
"The input numel (%d) must be divisible by nranks(%d)",
numel,
nranks));
framework::DDim dims = in->dims();
out->mutable_data<T>(dims, place);
int64_t send_numel = numel / nranks;
int offset = send_numel * rank;
void *send_buff =
reinterpret_cast<void *>(const_cast<T *>(in->data<T>()) + offset);
void *recv_buff = reinterpret_cast<void *>(out->data<T>());
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext *>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
VLOG(3) << "begin hccl allgather, parameter is: "
<< ", group is " << group << ", ring_id is " << ring_id
<< ", nranks is " << nranks << ", rankid is " << rank;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclAllGather(send_buff,
recv_buff,
send_numel,
dtype,
comm->comm(),
reinterpret_cast<void *>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -98,12 +98,7 @@ class PartialRecvOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
.SetDefault(5);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
.SetDefault(std::vector<int>());
AddAttr<bool>(
......
......@@ -22,57 +22,8 @@ template <typename T>
class PartialRecvOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(out->dims(), ctx.GetPlace());
int num = ctx.Attr<int>("num");
int id = ctx.Attr<int>("id");
int recv_numel = out->numel() / num;
int offset = recv_numel * id;
void* ptr =
reinterpret_cast<void*>(const_cast<T*>(out->data<T>()) + offset);
int numel = recv_numel;
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(out->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int peer = ctx.Attr<int>("peer");
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = peer;
VLOG(3) << "begin hccl recv, parameter is: "
<< "ring_id:" << ring_id << ", nranks:" << nranks
<< ", peer:" << peer << ", numel:" << numel << ", ptr:" << ptr
<< ", dtype:" << dtype << ", root:" << root
<< ", comm: " << comm->comm() << ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -65,12 +65,7 @@ class PartialSendMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0);
AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -22,52 +22,8 @@ template <typename T>
class PartialSendOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<phi::DenseTensor>("X");
int num = ctx.Attr<int>("num");
int id = ctx.Attr<int>("id");
int send_numel = x->numel() / num;
int offset = send_numel * id;
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()) + offset);
int numel = send_numel;
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int rank = comm->rank();
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = rank;
VLOG(3) << "begin hccl send, parameter is: "
<< "root " << root << ", comm: " << comm->comm()
<< ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -87,12 +87,7 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
.SetDefault(5);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
.SetDefault(std::vector<int>());
AddAttr<bool>(
......
......@@ -14,9 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/recv_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h"
......@@ -27,59 +24,8 @@ template <typename T>
class CRecvOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(out->dims(), ctx.GetPlace());
void* ptr = reinterpret_cast<void*>(const_cast<T*>(out->data<T>()));
int numel = out->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(out->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto map = distributed::ProcessGroupMapFromGid::getInstance();
if (map->has(ring_id)) {
// Use ProcessGroup
distributed::ProcessGroup* pg = map->get(ring_id);
std::vector<phi::DenseTensor> out_tensor;
out_tensor.emplace_back(*out);
auto task = pg->Recv(out_tensor, 0);
return;
}
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int peer = ctx.Attr<int>("peer");
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = peer;
VLOG(3) << "begin hccl recv, parameter is: "
<< "ring_id:" << ring_id << ", nranks:" << nranks
<< ", peer:" << peer << ", numel:" << numel << ", ptr:" << ptr
<< ", dtype:" << dtype << ", root:" << root
<< ", comm: " << comm->comm() << ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -61,12 +61,7 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0);
AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
......
......@@ -14,9 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/send_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h"
......@@ -27,56 +24,8 @@ template <typename T>
class CSendOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<phi::DenseTensor>("X");
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
int numel = x->numel();
HcclDataType dtype =
platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));
int ring_id = ctx.Attr<int>("ring_id");
auto map = distributed::ProcessGroupMapFromGid::getInstance();
if (map->has(ring_id)) {
// Use ProcessGroup
distributed::ProcessGroup* pg = map->get(ring_id);
std::vector<phi::DenseTensor> in_tensor;
in_tensor.push_back(*x);
auto task = pg->Send(in_tensor, 1);
return;
}
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int rank = comm->rank();
PADDLE_ENFORCE_EQ(nranks,
2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = rank;
VLOG(3) << "begin hccl send, parameter is: "
<< "root " << root << ", comm: " << comm->comm()
<< ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
......
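The deleted send kernel above mirrors the recv kernel at the top of this diff: it prefers a ProcessGroup registered for the ring id and otherwise falls back to the legacy HCCLCommContext communicator, emulating a point-to-point transfer with a two-rank HcclBroadcast rooted at the sender. The condensed sketch below only restates that control flow for reference; all identifiers are taken from the removed code, the helper name SendOnRing is hypothetical, and none of this builds once the PADDLE_WITH_ASCEND_CL path is gone.

// Illustrative, condensed sketch of the removed send path; identifiers mirror
// the deleted kernel above, SendOnRing is a hypothetical helper name.
template <typename T>
void SendOnRing(const framework::ExecutionContext& ctx,
                const phi::DenseTensor& x, int ring_id) {
  auto map = distributed::ProcessGroupMapFromGid::getInstance();
  if (map->has(ring_id)) {
    // Preferred path: a registered ProcessGroup owns stream and peer handling.
    std::vector<phi::DenseTensor> in_tensor{x};
    map->get(ring_id)->Send(in_tensor, /*dst_rank=*/1);
    return;
  }
  // Legacy path: per-ring HCCL communicator plus explicit stream selection.
  auto place = ctx.GetPlace();
  auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
  auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
  aclrtStream stream =
      ctx.Attr<bool>("use_calc_stream")
          ? static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream()
          : comm->stream();
  // Point-to-point send emulated as a two-rank broadcast rooted at the sender.
  void* ptr = reinterpret_cast<void*>(const_cast<T*>(x.data<T>()));
  HcclDataType dtype =
      platform::ToHCCLDataType(framework::TransToProtoVarType(x.dtype()));
  PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
      ptr, x.numel(), dtype, static_cast<uint32_t>(comm->rank()),
      comm->comm(), stream));
}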
......@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -84,12 +84,6 @@ class ConditionalOp : public framework::OperatorBase {
res = cpu_tensor.data<bool>()[0];
#endif
} else if (platform::is_npu_place(ips[0]->place())) {
#ifdef PADDLE_WITH_ASCEND_CL
phi::DenseTensor cpu_tensor;
framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
res = cpu_tensor.data<bool>()[0];
#endif
} else if (platform::is_xpu_place(ips[0]->place())) {
#ifdef PADDLE_WITH_XPU
phi::DenseTensor cpu_tensor;
......
......@@ -228,9 +228,8 @@ bool GetCondData(const phi::DenseTensor &cond) {
// platform::is_npu_place(cond.place()) or
// platform::is_xpu_place(cond.place()) is true
std::unique_ptr<phi::DenseTensor> cpu_cond{new phi::DenseTensor()};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU) || \
defined(PADDLE_WITH_CUSTOM_DEVICE)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
......
......@@ -16,8 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
......@@ -182,84 +181,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
template <typename T>
class ConcatFunctor<platform::NPUDeviceContext, T> {
public:
void operator()(const platform::NPUDeviceContext& context,
const std::vector<phi::DenseTensor>& input,
int axis,
phi::DenseTensor* output) {
int dev_id = context.GetPlace().GetDeviceId();
platform::NPUDeviceGuard guard(dev_id);
std::vector<std::string> names;
for (size_t i = 0; i < input.size(); ++i) {
names.push_back("x" + std::to_string(i));
}
NpuOpRunner runner{
"ConcatD",
{input},
{*output},
{{"concat_dim", axis}, {"N", static_cast<int>(input.size())}}};
runner.AddInputNames(names);
runner.Run(context.stream());
}
};
template <typename T>
class SplitFunctor<platform::NPUDeviceContext, T> {
public:
void operator()(const platform::NPUDeviceContext& context,
const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& ref_inputs,
const int axis,
std::vector<phi::DenseTensor*>* outputs) {
if (input.numel() == 0) {
return;
}
size_t num = outputs->size();
int input_rows = 1;
auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i];
}
int input_cols = 0;
std::vector<int64_t> output_cols(outputs->size());
for (size_t i = 0; i < num; ++i) {
int t_cols = ref_inputs[i]->numel() / input_rows;
input_cols += t_cols;
output_cols[i] = t_cols;
}
auto npu_place = context.GetPlace();
// computation
for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0;
for (size_t j = 0; j < num; ++j) {
int col_len = output_cols[j];
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len;
memory::Copy(npu_place,
dst_ptr,
npu_place,
src_ptr + col_idx,
sizeof(T) * col_len,
context.stream());
}
col_idx += col_len;
}
}
}
};
#endif
#ifdef PADDLE_WITH_MLU
template <typename T>
class ConcatFunctor<platform::MLUDeviceContext, T> {
......@@ -369,14 +290,6 @@ DEFINE_XPU_FUNCTOR(float)
DEFINE_XPU_FUNCTOR(platform::float16)
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#define DEFINE_NPU_FUNCTOR(type) \
template class ConcatFunctor<platform::NPUDeviceContext, type>; \
template class SplitFunctor<platform::NPUDeviceContext, type>;
FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR)
#endif
#ifdef PADDLE_WITH_MLU
#define DEFINE_MLU_FUNCTOR(type) \
template class ConcatFunctor<platform::MLUDeviceContext, type>; \
......
......@@ -123,34 +123,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
MemcpyH2DInferShapeFunctor);
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_h2d,
float,
ops::MemcpyH2DKernel,
double,
ops::MemcpyH2DKernel,
int8_t,
ops::MemcpyH2DKernel,
uint8_t,
ops::MemcpyH2DKernel,
int,
ops::MemcpyH2DKernel,
int64_t,
ops::MemcpyH2DKernel,
bool,
ops::MemcpyH2DKernel,
paddle::platform::bfloat16,
ops::MemcpyH2DKernel,
paddle::platform::complex<float>,
ops::MemcpyH2DKernel,
paddle::platform::complex<double>,
ops::MemcpyH2DKernel,
plat::float16,
ops::MemcpyH2DKernel,
int16_t,
ops::MemcpyH2DKernel);
#endif
#ifdef PADDLE_WITH_IPU
REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_h2d,
float,
......
......@@ -145,19 +145,3 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
MemcpyInferShapeFunctor);
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy,
float,
ops::MemcpyKernel,
double,
ops::MemcpyKernel,
int,
ops::MemcpyKernel,
int64_t,
ops::MemcpyKernel,
bool,
ops::MemcpyKernel,
plat::float16,
ops::MemcpyKernel);
#endif
......@@ -61,14 +61,7 @@ class MemcpyFunctor {
lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor);
} else if (dst_place_type_ == DeviceType::CPU) {
framework::TensorCopySync(lod_tensor, platform::CPUPlace(), &out_tensor);
#ifdef PADDLE_WITH_ASCEND_CL
} else if (dst_place_type_ == DeviceType::NPU) { /* npu_pin->npu */
framework::TensorCopy(
lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor);
} else if (dst_place_type_ == DeviceType::NPU_PINNED) { /* npu->npu_pin */
framework::TensorCopy(
lod_tensor, platform::NPUPinnedPlace(), dev_ctx_, &out_tensor);
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (dst_place_type_ == DeviceType::CUSTOM_DEVICE) {
framework::TensorCopy(
......
......@@ -63,21 +63,6 @@ BufferedReader::BufferedReader(
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place_)) {
int dev_idx = place_.device;
compute_stream_ =
((platform::NPUDeviceContext *)(platform::DeviceContextPool::Instance()
.Get(place_)))
->stream();
events_.resize(buffer_size);
for (auto &event : events_) {
event = platform::NpuEventResourcePool::Instance().New(dev_idx);
}
stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx);
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place_)) {
int dev_idx = place_.device;
......@@ -275,56 +260,6 @@ void BufferedReader::ReadAsync(size_t i) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place_)) {
TensorVec &npu = npu_buffer_[i];
if (npu.empty()) {
npu.resize(cpu.size());
} else {
PADDLE_ENFORCE_EQ(
npu.size(),
cpu.size(),
platform::errors::InvalidArgument(
"Input tensor number on NPU and CPU devices are not matched. "
"The number on NPU is %d, on CPU is %d",
npu.size(),
cpu.size()));
}
std::vector<void *> npu_ptrs;
npu_ptrs.reserve(cpu.size());
for (size_t i = 0; i < cpu.size(); ++i) {
npu[i].Resize(cpu[i].dims());
npu[i].set_layout(cpu[i].layout());
npu_ptrs.emplace_back(npu[i].mutable_data(place_, cpu[i].type()));
}
platform::SetNPUDeviceId(place_.device);
platform::NPUEventRecord(events_[i].get(), compute_stream_);
platform::NPUStreamWaitEvent(stream_.get(), events_[i].get());
platform::RecordEvent record_event("BufferedReader:MemoryCopy",
platform::TracerEventType::UserDefined,
1);
for (size_t i = 0; i < cpu.size(); ++i) {
auto cpu_place = cpu[i].place();
auto cpu_ptr = cpu[i].data();
auto npu_ptr = npu_ptrs[i];
auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype());
if ((platform::is_npu_place(cpu_place))) {
memory::Copy(
place_, npu_ptr, cpu_place, cpu_ptr, size, stream_.get());
} else {
memory::Copy(
place_, npu_ptr, cpu_place, cpu_ptr, size, stream_.get());
platform::NPUStreamSync(stream_.get());
}
npu[i].set_lod(cpu[i].lod());
}
platform::NPUStreamSync(stream_.get());
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place_)) {
TensorVec &mlu = mlu_buffer_[i];
......
......@@ -25,8 +25,7 @@
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"
......@@ -93,12 +92,6 @@ class BufferedReader : public framework::DecoratedReader {
std::vector<std::shared_ptr<platform::CudaEventObject>> events_;
#endif
#ifdef PADDLE_WITH_ASCEND_CL
aclrtStream compute_stream_;
std::shared_ptr<platform::NpuStreamObject> stream_;
std::vector<std::shared_ptr<platform::NpuEventObject>> events_;
#endif
#ifdef PADDLE_WITH_MLU
mluStream compute_stream_;
std::shared_ptr<platform::MluStreamObject> stream_;
......
......@@ -11,19 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/operators/run_program_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
/* see [Why use single type kernel] */
REGISTER_OP_NPU_KERNEL(
run_program,
ops::RunProgramOpKernel<paddle::platform::NPUDeviceContext, float>);
REGISTER_OP_NPU_KERNEL(
run_program_grad,
ops::RunProgramGradOpKernel<paddle::platform::NPUDeviceContext, float>);
#endif
......@@ -11,107 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ScatterNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* index = ctx.Input<phi::DenseTensor>("Ids");
auto* updates = ctx.Input<phi::DenseTensor>("Updates");
bool overwrite = ctx.Attr<bool>("overwrite");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
phi::DenseTensor tmp_tensor(index->type());
const auto index_dims = index->dims();
if (index_dims.size() == 1) {
tmp_tensor.ShareDataWith(*index);
std::vector<int64_t> new_dim = {index_dims[0], 1};
tmp_tensor.Resize(phi::make_ddim(new_dim));
index = &tmp_tensor;
}
const auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto op_func_update = [](const std::vector<phi::DenseTensor>& inputs,
const std::vector<phi::DenseTensor>& outputs,
const NPUAttributeMap& attrs,
const platform::NPUDeviceContext& dev_ctx) {
const auto& runner =
NpuOpRunner("TensorScatterUpdate", inputs, outputs, attrs);
runner.Run(dev_ctx.stream());
};
auto op_func_add = [](const std::vector<phi::DenseTensor>& inputs,
const std::vector<phi::DenseTensor>& outputs,
const NPUAttributeMap& attrs,
const platform::NPUDeviceContext& dev_ctx) {
const auto& runner =
NpuOpRunner("TensorScatterAdd", inputs, outputs, attrs);
runner.Run(dev_ctx.stream());
};
if (overwrite) {
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
NpuOpRunner::TypeAdapter({*x, *index, *updates},
{*out},
{},
dev_ctx,
op_func_update,
{framework::proto::VarType::INT32,
framework::proto::VarType::INT32,
framework::proto::VarType::INT32},
{framework::proto::VarType::INT32});
} else {
const auto& runner_update = NpuOpRunner(
"TensorScatterUpdate", {*x, *index, *updates}, {*out}, {});
runner_update.Run(dev_ctx.stream());
}
} else {
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
NpuOpRunner::TypeAdapter({*x, *index, *updates},
{*out},
{},
dev_ctx,
op_func_add,
{framework::proto::VarType::INT32,
framework::proto::VarType::INT32,
framework::proto::VarType::INT32},
{framework::proto::VarType::INT32});
} else {
const auto& runner_add =
NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {});
runner_add.Run(dev_ctx.stream());
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
scatter,
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
#endif
......@@ -41,7 +41,7 @@ class SoftmaxWithCrossEntropyOpMaker
"The outputs value of softmax activation by given the input batch, "
"which will be used in backward calculation.")
.AsIntermediate();
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
AddOutput(
"Backprop",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
......@@ -135,7 +135,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
true,
platform::errors::InvalidArgument(
"Output(Softmax) should be not null."));
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"),
true,
platform::errors::InvalidArgument(
......@@ -206,10 +206,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
}
ctx->SetOutputDim("Softmax", logits_dims);
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
ctx->SetOutputDim("Backprop", logits_dims);
ctx->ShareLoD("Logits", /*->*/ "Backprop");
#endif
logits_dims[axis] = 1;
ctx->SetOutputDim("Loss", logits_dims);
......@@ -238,7 +235,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
true,
platform::errors::InvalidArgument(
"Input(Softmax) should be not null."));
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"),
true,
platform::errors::InvalidArgument(
......@@ -327,7 +324,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker<T> {
grad_op->SetType("softmax_with_cross_entropy_grad");
grad_op->SetInput("Label", this->Input("Label"));
grad_op->SetInput("Softmax", this->Output("Softmax"));
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
grad_op->SetInput("Backprop", this->Output("Backprop"));
#endif
grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
......@@ -359,7 +356,7 @@ REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradInplaceInferer);
REGISTER_OP_VERSION(softmax_with_cross_entropy)
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
.AddCheckpoint(
R"ROC(
Add a new attribute [use_softmax] )ROC",
......
......@@ -127,11 +127,6 @@ void TensorFormatter::FormatData(const phi::DenseTensor& print_tensor,
} else {
platform::CPUPlace cpu_place;
paddle::framework::TensorCopy(print_tensor, cpu_place, &cpu_tensor);
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(print_tensor.place())) {
platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait();
}
#endif
data = cpu_tensor.data<T>();
}
......
......@@ -11,50 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include "paddle/fluid/operators/unsqueeze_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
unsqueeze,
ops::UnsqueezeKernel<plat::NPUDeviceContext, float>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, double>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, plat::float16>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, bool>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
unsqueeze2,
ops::UnsqueezeKernel<plat::NPUDeviceContext, float>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, double>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, plat::float16>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, bool>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
unsqueeze_grad,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, float>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, double>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, plat::float16>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, bool>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int8_t>,
ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
unsqueeze2_grad,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, float>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, double>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, plat::float16>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, bool>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int64_t>);
#endif
......@@ -92,7 +92,7 @@ inline T GetValue(const phi::DenseTensor* x) {
if (!platform::is_cpu_place(x->place())) {
phi::DenseTensor cpu_x;
framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_MLU)
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx = pool.Get(x->place());
dev_ctx->Wait();
......
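Several hunks in this diff (ConditionalOp, GetCondData, TensorFormatter, and GetValue above) rely on the same idiom for reading a scalar that may live on a non-CPU device: copy the tensor to CPUPlace, wait on the source place's device context so a possibly asynchronous copy has completed, then read the host copy. A minimal sketch of that idiom follows; ReadScalarFromDevice is a hypothetical name, not an existing Paddle helper.

// Minimal sketch of the "copy to CPU, synchronize, then read" idiom used by
// the hunks above; ReadScalarFromDevice is hypothetical.
template <typename T>
T ReadScalarFromDevice(const phi::DenseTensor& x) {
  if (platform::is_cpu_place(x.place())) {
    return x.data<T>()[0];
  }
  phi::DenseTensor cpu_x;
  framework::TensorCopy(x, platform::CPUPlace(), &cpu_x);
  // TensorCopy can be asynchronous on stream-based devices, so wait on the
  // source place's device context before touching the host copy.
  platform::DeviceContextPool::Instance().Get(x.place())->Wait();
  return cpu_x.data<T>()[0];
}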
......@@ -147,118 +147,6 @@ class NCCLCommContext {
};
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
// In order to apply hierarchical communication with HCCL, we need
// a communication ring that contains HCCL communicators associated with a
// global HCCLUniqueId. E.g. for a hierarchical case,
//
// 11 - 12 21 - 22
// | | | |
// 13 - 14 - 23 - 24
// | |
// 31 - 32 - 41 - 42
// | | | |
// 33 - 34 43 - 44
//
// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
// (31,32,33,34), (41,42,43,44) as bottoms respectively.
//
// We could also use a single communication ring for the flattened case
//
// The HCCLComm instance is created and reserved in the HCCLCommContext
// singleton with a global user-specified group id.
class NPUDeviceContext;
#define ENV_RANK_TABLE_FILE "RANK_TABLE_FILE"
#define ENV_RANK_ID "PADDLE_TRAINER_ID"
class HCCLComm {
public:
virtual int ring_id() const = 0;
virtual int nranks() const = 0;
virtual int rank() const = 0;
virtual int device_id() const = 0;
virtual HcclComm comm() const = 0;
virtual aclrtStream stream() const = 0;
virtual NPUDeviceContext* dev_context() const = 0;
virtual ~HCCLComm() = default;
};
// A singleton HCCL communicator context reserves communication ring ids
class HCCLCommContext {
public:
static HCCLCommContext& Instance() {
static HCCLCommContext comm_ctx;
return comm_ctx;
}
HCCLComm* CreateHCCLComm(
HcclRootInfo* hccl_id, int nranks, int rank, int dev_id, int ring_id);
// a comm created later with the same dev_id and the same ring_id
// will override the former one
HCCLComm* AssignHCCLComm(
HcclComm comm, int nranks, int rank, int dev_id, int ring_id);
// retrieve a communicator by the ring id in multiprocessing mode
HCCLComm* Get(int ring_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id),
0,
platform::errors::InvalidArgument(
"Communicator in ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(),
1,
platform::errors::InvalidArgument(
"One device id should be specified to retrieve from "
"multiple communicators."));
return comm_map_.at(ring_id).begin()->second.get();
}
// retrieve a communicator by the ring id and the device id
HCCLComm* Get(int ring_id, int dev_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id),
0,
platform::errors::InvalidArgument(
"Communicator of ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_GT(
comm_map_.at(ring_id).count(dev_id),
0,
platform::errors::InvalidArgument(
"Communicator at device id %d has not been initialized in ring %d.",
dev_id,
ring_id));
return comm_map_.at(ring_id).at(dev_id).get();
}
// retrieve a communicator by the ring id and place
HCCLComm* Get(int ring_id, Place place) const {
return Get(ring_id, place.device);
}
private:
// Init global hcom
HCCLCommContext() {}
// we may use the group feature in the future
// HCCLCommContext() { InitHcomWorldGroup(); }
HcclComm comm_;
public:
~HCCLCommContext() {}
std::once_flag once_flag_;
std::mutex comm_map_mutex_;
// ring id to dev-HCCLComm
std::map<int, std::map<int, std::unique_ptr<HCCLComm>>> comm_map_;
// void InitHcomWorldGroup();
void ReleaseHCCLComms();
DISABLE_COPY_AND_ASSIGN(HCCLCommContext);
};
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
// In order to apply hierarchical communication with BKCL, we need
// a communication ring that contains BKCL communicators associated with a global
......
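The two Get overloads of the deleted context encode its lookup contract: Get(ring_id) is valid only when exactly one communicator is registered for the ring (the multi-process case), while Get(ring_id, dev_id) or the Place overload disambiguates when several devices of one process share a ring. The small wrapper below is purely illustrative, assumes the removed class were still available, and uses a hypothetical name.

// Illustrative only: PickComm is a hypothetical helper showing the two lookup
// modes documented in the deleted HCCLCommContext declaration above.
platform::HCCLComm* PickComm(int ring_id, int dev_id, bool multi_device) {
  auto& ctx = platform::HCCLCommContext::Instance();
  if (!multi_device) {
    // Multi-process mode: exactly one communicator is registered per ring.
    return ctx.Get(ring_id);
  }
  // Several devices of one process share the ring: the device id disambiguates.
  return ctx.Get(ring_id, dev_id);
}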
......@@ -266,51 +266,5 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
// NOTE(zhiqiu): Usually, no need to create context explicitly,
// ACL creates a default context which contains 1 default stream
// and 1 sync stream after aclrtSetDevice.
platform::GetCurrentNPUContext(&context_);
stream_.reset(new stream::NPUStream(place));
}
NPUDeviceContext::~NPUDeviceContext() {
// NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}
void NPUDeviceContext::Wait() const {
platform::RecordEvent record_event(
"NPUDeviceContext/wait", platform::TracerEventType::UserDefined, 2);
VLOG(4) << "NPU context(" << this << ") Wait";
stream_->Wait();
}
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
const Place& NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext NPUDeviceContext::context() const { return context_; }
NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
}
NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
: place_(place) {
eigen_device_.reset(new Eigen::DefaultDevice());
}
Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif
} // namespace platform
} // namespace paddle
......@@ -34,10 +34,6 @@ limitations under the License. */
#include "xpu/bkcl.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
#if defined(PADDLE_WITH_CNCL)
#include <cncl.h>
#endif
......@@ -334,11 +330,7 @@ static int ConnectAddr(const std::string& ep, const CommHead head) {
}
// TODO(WANGXI): maybe need to unify this hard-coded value
#ifdef PADDLE_WITH_ASCEND_CL
#define MAX_COMMUNIQUEID_LEN 4108
#else
#define MAX_COMMUNIQUEID_LEN 1024
#endif
template <typename CommUniqueId>
static void RecvCommID(int conn, CommUniqueId* nccl_id) {
......@@ -456,9 +448,7 @@ INSTANT_TEMPLATE(ncclUniqueId)
#ifdef PADDLE_WITH_XPU_BKCL
INSTANT_TEMPLATE(BKCLUniqueId)
#endif
#ifdef PADDLE_WITH_ASCEND_CL
INSTANT_TEMPLATE(HcclRootInfo)
#endif
#ifdef PADDLE_WITH_CNCL
INSTANT_TEMPLATE(cnclCliqueId)
#endif
......
......@@ -228,9 +228,7 @@ void InitDevices(const std::vector<int> devices) {
#ifdef PADDLE_WITH_IPU
places.emplace_back(platform::IPUPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_ASCEND_CL
places.emplace_back(platform::NPUPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_MLU
places.emplace_back(platform::MLUPlace(devices[i]));
#endif
......
......@@ -19,8 +19,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
//
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#include "paddle/phi/common/place.h"
namespace paddle {
......@@ -95,24 +93,14 @@ typename Visitor::result_type VisitPlace(const Place &place,
#endif
}
case phi::AllocationType::NPU: {
#ifdef PADDLE_WITH_ASCEND_CL
platform::NPUPlace p(place.GetDeviceId());
return visitor(p);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu_pinned"));
return typename Visitor::result_type();
#endif
}
case phi::AllocationType::NPUPINNED: {
#ifdef PADDLE_WITH_ASCEND_CL
platform::NPUPinnedPlace p;
return visitor(p);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu_pinned"));
return typename Visitor::result_type();
#endif
}
case phi::AllocationType::IPU: {
#ifdef PADDLE_WITH_IPU
......
......@@ -33,11 +33,8 @@ static void StreamCallbackFunc(gpuStream_t stream,
#endif
#endif
#if PADDLE_WITH_ASCEND_CL
static void StreamCallbackFunc(void *user_data)
#endif
#if PADDLE_WITH_MLU
static void StreamCallbackFunc(void *user_data)
static void StreamCallbackFunc(void *user_data)
#endif
{
std::unique_ptr<std::function<void()>> func(
......@@ -75,12 +72,6 @@ void StreamCallbackManager<Stream>::AddCallback(
#endif
#endif
#if PADDLE_WITH_ASCEND_CL
VLOG(3) << "aclrtLaunchCallback at stream: " << stream_;
// TODO(zhiqiu): failed to call aclrtLaunchCallback
NPULaunchCallback(StreamCallbackFunc, func, ACL_CALLBACK_BLOCK, stream_);
#endif
#if PADDLE_WITH_MLU
VLOG(3) << "MLULaunchCallback at stream: " << stream_;
cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func);
......@@ -94,9 +85,6 @@ void StreamCallbackManager<Stream>::Wait() const {
#endif
#ifdef PADDLE_WITH_MLU
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUStreamSync(stream_);
#endif
{
std::lock_guard<std::mutex> lock(mtx_);
......@@ -112,9 +100,7 @@ template struct StreamCallbackManager<gpuStream_t>;
#ifdef PADDLE_WITH_HIP
template struct StreamCallbackManager<hipStream_t>;
#endif
#ifdef PADDLE_WITH_ASCEND_CL
template struct StreamCallbackManager<aclrtStream>;
#endif
#ifdef PADDLE_WITH_MLU
template struct StreamCallbackManager<mluStream>;
#endif
......
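The StreamCallbackFunc trampolines in this file all use the standard C++ technique for passing a std::function through a C-style void* callback: heap-allocate the closure when the callback is enqueued, hand the raw pointer to the runtime as user_data, and re-wrap it in a std::unique_ptr inside the trampoline so it runs and is destroyed exactly once. The framework-free sketch below shows the pattern; Trampoline, EnqueueCallback, and the launch signature are illustrative, not Paddle or vendor APIs.

#include <functional>
#include <memory>

// C-style callback signature expected by a hypothetical stream runtime.
using RawCallback = void (*)(void* user_data);

// Trampoline: take ownership back from the raw pointer and run the closure
// exactly once; the unique_ptr destroys it when the trampoline returns.
static void Trampoline(void* user_data) {
  std::unique_ptr<std::function<void()>> func(
      reinterpret_cast<std::function<void()>*>(user_data));
  (*func)();
}

// Enqueue side: heap-allocate the closure so it outlives this call, then hand
// the trampoline plus the raw pointer to the runtime's launch function.
void EnqueueCallback(void (*launch)(RawCallback, void*),
                     std::function<void()> callback) {
  auto* heap_cb = new std::function<void()>(std::move(callback));
  launch(Trampoline, heap_cb);
}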
......@@ -26,12 +26,9 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/pybind/eager_generator.h"
#include "paddle/fluid/pybind/pybind.h"
#include "paddle/fluid/string/string_helper.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
#endif
#include "paddle/fluid/pybind/eager_generator.h"
// phi
#include "paddle/phi/kernels/declarations.h"
......@@ -485,11 +482,6 @@ int main(int argc, char* argv[]) {
return -1;
}
#ifdef PADDLE_WITH_ASCEND_CL
auto ascend_ptr = paddle::framework::AscendInstance::GetInstance();
ascend_ptr->InitGEForUT();
#endif
std::vector<std::string> headers{
"<Python.h>",
"\"paddle/fluid/platform/enforce.h\"",
......@@ -557,9 +549,5 @@ int main(int argc, char* argv[]) {
out.close();
#ifdef PADDLE_WITH_ASCEND_CL
ge::GEFinalize();
#endif
return 0;
}
......@@ -2547,9 +2547,9 @@ void BindImperative(py::module *m_ptr) {
},
py::call_guard<py::gil_scoped_release>());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \
defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_CNCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_CNCL)
py::class_<imperative::ParallelContext,
std::shared_ptr<imperative::ParallelContext>>(m,
"ParallelContext");
......@@ -2630,7 +2630,7 @@ void BindImperative(py::module *m_ptr) {
#endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_XPU_BKCL)
py::class_<imperative::HeterParallelContext,
imperative::ParallelContext,
std::shared_ptr<imperative::HeterParallelContext>>(
......
......@@ -57,7 +57,7 @@ inline void CopyWithContext(const Context& ctx,
const void* src,
size_t num) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
defined(PADDLE_WITH_MLU)
memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream());
#else
PADDLE_THROW(
......
......@@ -101,9 +101,6 @@ int main(int argc, char** argv) {
int ret = RUN_ALL_TESTS();
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::AclInstance::Instance().Finalize();
#endif
if (env_str) free(env_str);
if (undefok_str) free(undefok_str);
return ret;
......