未验证 提交 69436bf5 编写于 作者: J jjyaoao 提交者: GitHub

Delete the /paddle/fluid/platform/device/npu directory (#52384)

* Delete the /paddle/fluid/platform/device/npu directory

* clear Cmakelists

* Try removing npu in the header file
上级 3784ae63
......@@ -24,7 +24,6 @@
#include "paddle/fluid/distributed/collective/custom_ccl_tools.h"
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/fluid/distributed/collective/process_group_with_stream.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
......
......@@ -20,7 +20,6 @@
#include "paddle/phi/common/amp_type_traits.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#endif
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
......
......@@ -18,7 +18,6 @@
#include <thread>
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/xpu/xpu_info.h"
......
......@@ -35,7 +35,6 @@
#include "xpu/refactor/math.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
......
......@@ -18,7 +18,6 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
......
......@@ -19,8 +19,6 @@
#include <vector>
#include "paddle/fluid/imperative/parallel_context.h"
#include "paddle/fluid/platform/device/npu/dynload/hccl.h"
#include "paddle/fluid/platform/device/npu/npu_resource_pool.h"
namespace paddle {
namespace framework {
......@@ -39,7 +37,7 @@ class HCCLParallelContext : public ParallelContext {
~HCCLParallelContext() override = default;
void BcastHCCLId(std::vector<HcclRootInfo>& hccl_ids,
void BcastHCCLId(const std::vector<HcclRootInfo>& hccl_ids,
int root, // NOLINT
int server_fd);
......
......@@ -28,7 +28,6 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/memory_block.h"
#include "paddle/fluid/memory/allocation/system_allocator.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
......
......@@ -25,7 +25,6 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
......
......@@ -16,7 +16,6 @@
#include <string>
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......
......@@ -21,7 +21,6 @@
#include "acl/acl.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
......
......@@ -29,7 +29,6 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef PADDLE_WITH_MLU
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the Licnse. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -18,7 +18,6 @@ limitations under the Licnse. */
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
DECLARE_int32(min_loss_scaling);
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the Licnse. */
#include "paddle/fluid/operators/arg_min_max_op_base.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/assign_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -15,8 +15,6 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/clip_by_norm_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -22,7 +22,6 @@
#include "paddle/phi/backends/device_memory_aligment.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#endif
#include "paddle/fluid/framework/convert_utils.h"
#ifdef PADDLE_WITH_MLU
......
......@@ -18,7 +18,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace paddle {
......
......@@ -36,7 +36,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace f = paddle::framework;
......
......@@ -36,7 +36,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace f = paddle::framework;
......
......@@ -22,7 +22,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/api/include/tensor.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
......@@ -46,7 +45,6 @@ limitations under the License. */
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
#if defined(PADDLE_WITH_CNCL)
......
......@@ -33,7 +33,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace paddle {
......
......@@ -33,7 +33,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace f = paddle::framework;
......
......@@ -25,7 +25,6 @@ class Scope;
#include "hccl/hccl.h"
#include "hccl/hccl_types.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace paddle {
......
......@@ -17,8 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/collective/c_embedding_op.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/device/npu/dynload/hccl.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
......
......@@ -45,7 +45,6 @@ limitations under the License. */
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
#if defined(PADDLE_WITH_CNCL)
......
......@@ -33,7 +33,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace f = paddle::framework;
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace paddle {
......
......@@ -36,7 +36,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace f = paddle::framework;
......
......@@ -11,6 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_FLUID_OPERATORS_COLLECTIVE_C_SYNC_COMM_STREAM_OP_H_
#define PADDLE_FLUID_OPERATORS_COLLECTIVE_C_SYNC_COMM_STREAM_OP_H_
#include <string>
#include "paddle/fluid/framework/op_registry.h"
......@@ -19,10 +22,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
......@@ -89,3 +88,4 @@ class CSyncCommStreamKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
#endif // PADDLE_FLUID_OPERATORS_COLLECTIVE_C_SYNC_COMM_STREAM_OP_H_
......@@ -33,7 +33,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace f = paddle::framework;
......
......@@ -34,7 +34,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace f = paddle::framework;
......
......@@ -22,7 +22,6 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
......
......@@ -32,7 +32,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
DECLARE_int32(get_host_by_name_time);
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/partial_allgather_op.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/partial_recv_op.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/send_v2_op.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h"
......
......@@ -33,7 +33,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace f = paddle::framework;
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h"
......
......@@ -33,7 +33,6 @@ limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#endif
namespace f = paddle::framework;
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -10,7 +10,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
namespace paddle {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/crop_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -12,7 +12,6 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/impl/box_coder.h"
namespace paddle {
......
......@@ -10,7 +10,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/density_prior_box_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/iou_similarity_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/prior_box_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -12,7 +12,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/expand_as_v2_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_v2_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fill_zeros_like_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/flatten_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
......
......@@ -18,8 +18,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/interpolate_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/interpolate_function.h"
namespace paddle {
......
......@@ -15,7 +15,6 @@ limitations under the Licnse. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include <cmath>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
namespace paddle {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/common/data_type.h"
namespace phi {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -10,7 +10,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#if (CANN_VERSION_CODE >= 504000)
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -10,7 +10,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/one_hot_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
......
......@@ -10,7 +10,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/pooling.h"
namespace paddle {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/range_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -26,8 +26,6 @@
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device/npu/npu_resource_pool.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/operators/unsqueeze_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
......
......@@ -10,7 +10,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/seed_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/set_value_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/slice_utils.h"
namespace paddle {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/slice_utils.h"
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/smooth_l1_loss_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
namespace paddle {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax.h"
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/split_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
// #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/strided_slice.h"
......
......@@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -14,8 +14,6 @@ limitations under the Licnse. */
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#if (CANN_VERSION_CODE >= 504000)
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/tile_op_functor.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/unsqueeze_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
......
......@@ -20,7 +20,6 @@
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/device/npu/dynload/hccl.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/variant.h"
......
......@@ -10,11 +10,6 @@ if(WITH_XPU)
add_subdirectory(xpu)
endif()
# NPU
if(WITH_ASCEND OR WITH_ASCEND_CL)
add_subdirectory(npu)
endif()
# IPU
if(WITH_IPU)
add_subdirectory(ipu)
......
......@@ -26,8 +26,6 @@ limitations under the License. */
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
......
# NPU
# Build rules for the Ascend NPU platform support libraries.
add_subdirectory(dynload)

# ascend_npu_info: minimal device-count query linked against libatlas_acl
# (the older WITH_ASCEND toolchain).
if(WITH_ASCEND)
  cc_library(
    ascend_npu_info
    SRCS ascend_npu_info.cc
    DEPS gflags glog enforce atlas_acl)
endif()

# CANN (WITH_ASCEND_CL) runtime support: device info, pooled resources,
# streams, collective-communication helpers and the op runner.
if(WITH_ASCEND_CL)
  cc_library(
    npu_info
    SRCS npu_info.cc
    DEPS gflags glog enforce monitor ascendcl acl_op_compiler)
  cc_library(
    npu_resource_pool
    SRCS npu_resource_pool.cc
    DEPS npu_info)
  cc_library(
    npu_stream
    SRCS npu_stream.cc
    DEPS enforce stream_callback_manager)
  cc_library(
    npu_collective_helper
    SRCS npu_collective_helper.cc
    DEPS npu_stream npu_info data_type)
  cc_library(
    npu_op_runner
    SRCS npu_op_runner.cc
    DEPS operator npu_info)
endif()

# every source file that includes "dnnl.h" must depends on mkldnn
# or, the first one should depends on mkldnn
if(WITH_MKLDNN)
  add_dependencies(npu_collective_helper mkldnn)
endif()
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/npu/ascend_npu_info.h"
#include <glog/logging.h>
#include "acl/acl_rt.h"
namespace paddle {
namespace platform {
namespace ascend {
int NPUDevice::GetDeviceCount() {
uint32_t count = 0;
aclError status = aclrtGetDeviceCount(&count);
if (status != 0) {
PADDLE_THROW(platform::errors::InvalidArgument(
"aclrtGetDeviceCount error code: %d", status));
return -1;
}
return count;
}
} // namespace ascend
} // namespace platform
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
namespace paddle {
namespace platform {
namespace ascend {
// Thin static-query wrapper over the ACL runtime for Ascend NPU devices.
class NPUDevice {
 public:
  //! Get the total number of NPU devices in system.
  static int GetDeviceCount();
};
} // namespace ascend
} // namespace platform
} // namespace paddle
#endif
# Lazy (dlopen-based) loading of the HCCL shared library; only built when
# the CANN toolkit (WITH_ASCEND_CL) is enabled.
# NOTE(review): the `warpctc` dependency looks unrelated to HCCL loading —
# confirm it is actually required here.
if(WITH_ASCEND_CL)
  cc_library(
    npu_hccl
    SRCS hccl.cc
    DEPS dynamic_loader warpctc)
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL

#include "paddle/fluid/platform/device/npu/dynload/hccl.h"

namespace paddle {
namespace platform {
namespace dynload {

// Definitions for the once-guard and DSO handle declared `extern` in hccl.h;
// shared by every generated DynLoad__* wrapper.
std::once_flag hccl_dso_flag;
void *hccl_dso_handle;

// Instantiates the wrapper functor object for each HCCL entry point declared
// via DECLARE_DYNAMIC_LOAD_HCCL_WRAP in the header.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

HCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);

// Entry points only present in newer HCCL releases are guarded by version.
#if HCCL_VERSION_CODE >= 2212
HCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif

#if HCCL_VERSION_CODE >= 2703
HCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif

}  // namespace dynload
}  // namespace platform
}  // namespace paddle
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#ifdef PADDLE_WITH_ASCEND_CL
#include <hccl/hccl.h>
#include <hccl/hccl_types.h>

#include <mutex>  // NOLINT

#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/phi/backends/dynload/port.h"

#define HCOM_GROUP_PREFIX "HCOM_GROUP_"

namespace paddle {
namespace platform {
namespace dynload {

// Guard and handle for lazily dlopen-ing the HCCL shared library; defined
// once in hccl.cc and shared by all wrappers generated below.
extern std::once_flag hccl_dso_flag;
extern void* hccl_dso_handle;

// Declares a functor DynLoad__<name> (plus a global instance `<name>`) that
// resolves <name> from the HCCL DSO on first call via dlsym and forwards all
// arguments to it with the original signature.
#define DECLARE_DYNAMIC_LOAD_HCCL_WRAP(__name)                           \
  struct DynLoad__##__name {                                             \
    template <typename... Args>                                          \
    auto operator()(Args... args) -> decltype(__name(args...)) {         \
      using HCCL_func = decltype(&::__name);                             \
      std::call_once(hccl_dso_flag, []() {                               \
        hccl_dso_handle = paddle::platform::dynload::GetHCCLDsoHandle(); \
      });                                                                \
      static void* p_##__name = dlsym(hccl_dso_handle, #__name);         \
      return reinterpret_cast<HCCL_func>(p_##__name)(args...);           \
    }                                                                    \
  };                                                                     \
  extern DynLoad__##__name __name

// Baseline HCCL entry points wrapped for every supported version.
#define HCCL_RAND_ROUTINE_EACH(__macro) \
  __macro(HcclReduceScatter);           \
  __macro(HcclCommDestroy);             \
  __macro(HcclAllReduce);               \
  __macro(HcclCommInitRootInfo);        \
  __macro(HcclGetRootInfo);             \
  __macro(HcclBroadcast);               \
  __macro(HcclCommInitClusterInfo);     \
  __macro(HcclAllGather);

HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)

// NOTE(review): the version-gated wrappers below use "HCCL" casing
// (HCCLBroadcast/HCCLSend/HCCLRecv) while the baseline set uses "Hccl".
// dlsym resolves these exact strings — confirm the symbols exist with this
// casing in the targeted HCCL releases.
#if HCCL_VERSION_CODE >= 2212
#define HCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(HCCLBroadcast);
HCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
#endif

#if HCCL_VERSION_CODE >= 2703
#define HCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
  __macro(HCCLSend);                               \
  __macro(HCCLRecv);
HCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
#endif

}  // namespace dynload
}  // namespace platform
}  // namespace paddle
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <string>
#include "acl/acl.h"
#include "hccl/hccl_types.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace details {
// Trait mapping a status type (aclError / HcclResult) to its "success"
// value, so PADDLE_ENFORCE_NPU_SUCCESS below can compare generically.
template <typename T>
struct NPUStatusType {};

#define DEFINE_NPU_STATUS_TYPE(type, success_value) \
  template <>                                       \
  struct NPUStatusType<type> {                      \
    using Type = type;                              \
    static constexpr Type kSuccess = success_value; \
  }

// The two status domains used by the NPU backend: ACL runtime and HCCL.
DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
DEFINE_NPU_STATUS_TYPE(HcclResult, HCCL_SUCCESS);
} // namespace details
// Renders an ACL runtime status code into a human-readable error message.
inline std::string build_npu_error_msg(aclError stat) {
  std::ostringstream message;
  message << " ACL error, the error code is : ";
  message << stat << ". ";
  return message.str();
}
// Renders an HCCL status code into a human-readable error message.
inline std::string build_npu_error_msg(HcclResult stat) {
  std::ostringstream message;
  message << " HCCL error, the error code is : ";
  message << stat << ". ";
  return message.str();
}
// Evaluates COND (an ACL or HCCL call), deduces its status type, and throws
// an External error with a formatted message when the result differs from
// that type's kSuccess value (see NPUStatusType above).
#define PADDLE_ENFORCE_NPU_SUCCESS(COND)                       \
  do {                                                         \
    auto __cond__ = (COND);                                    \
    using __NPU_STATUS_TYPE__ = decltype(__cond__);            \
    constexpr auto __success_type__ =                          \
        ::paddle::platform::details::NPUStatusType<            \
            __NPU_STATUS_TYPE__>::kSuccess;                    \
    if (UNLIKELY(__cond__ != __success_type__)) {              \
      auto __summary__ = ::paddle::platform::errors::External( \
          ::paddle::platform::build_npu_error_msg(__cond__));  \
      __THROW_ERROR_INTERNAL__(__summary__);                   \
    }                                                          \
  } while (0)
} // namespace platform
} // namespace paddle
#endif // PADDLE_WITH_ASCEND_CL
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <stdio.h>
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <typeindex>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/dynload/hccl.h"
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/fluid/platform/float16.h"
#define HCCL_ID_VARNAME "HCCLID"
namespace paddle {
namespace platform {
// Translates a fluid proto VarType element type into the matching HCCL wire
// datatype. Only FP32/FP16/INT64/INT32/INT8 have HCCL equivalents; any other
// element type raises Unimplemented.
inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) {
  switch (type) {
    case framework::proto::VarType::FP32:
      return HCCL_DATA_TYPE_FP32;
    case framework::proto::VarType::FP16:
      return HCCL_DATA_TYPE_FP16;
    case framework::proto::VarType::INT64:
      return HCCL_DATA_TYPE_INT64;
    case framework::proto::VarType::INT32:
      return HCCL_DATA_TYPE_INT32;
    case framework::proto::VarType::INT8:
      return HCCL_DATA_TYPE_INT8;
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "This datatype in hccl is not supported."));
  }
}
// Translates a phi::DataType into the matching HCCL wire datatype. Only
// FLOAT32/FLOAT16/INT64/INT32/INT8 have HCCL equivalents; any other type
// raises Unimplemented.
inline HcclDataType ToHCCLDataType(phi::DataType type) {
  switch (type) {
    case phi::DataType::FLOAT32:
      return HCCL_DATA_TYPE_FP32;
    case phi::DataType::FLOAT16:
      return HCCL_DATA_TYPE_FP16;
    case phi::DataType::INT64:
      return HCCL_DATA_TYPE_INT64;
    case phi::DataType::INT32:
      return HCCL_DATA_TYPE_INT32;
    case phi::DataType::INT8:
      return HCCL_DATA_TYPE_INT8;
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "This datatype in hccl is not supported."));
  }
}
// NOTE(minqiyang): according to the ncclGroupEnd documentations:
// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
// ncclGroupEnd will wait for all communicators to be initialized, which will
// cause blocking problem when a runtime_error was thrown, so try only guard
// HCCL actions when use it.
// class HCCLGroupGuard {
// public:
// static std::mutex &HCCLMutex() {
// static std::mutex mtx;
// return mtx;
// }
// inline HCCLGroupGuard() {
// HCCLMutex().lock();
// PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart());
// }
// inline ~HCCLGroupGuard() PADDLE_MAY_THROW {
// PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd());
// HCCLMutex().unlock();
// }
// };
// One NPU device context paired with the HCCL communicator used on that
// device. The communicator handle starts null and is assigned later by
// HCCLContextMap.
struct HCCLContext {
  std::unique_ptr<NPUDeviceContext> ctx_;  // owning context for the device
  HcclComm comm_;                          // non-owning until assigned

  explicit HCCLContext(int dev_id)
      : ctx_(new NPUDeviceContext(NPUPlace(dev_id))), comm_{nullptr} {}

  // ACL stream of the owned device context.
  aclrtStream stream() const { return ctx_->stream(); }
  HcclComm comm() const { return comm_; }

  int device_id() const { return ctx_->GetPlace().device; }
};
// Maps device id -> HCCLContext for a set of places and initializes one HCCL
// communicator per device via HcclCommInitRootInfo. Non-copyable.
struct HCCLContextMap {
  std::unordered_map<int, HCCLContext> contexts_;
  // Device ids in the order the places were given; drives rank assignment.
  std::vector<int> order_;

  // places: one entry per local device (duplicates are rejected below).
  // hccl_id: root info shared across trainers; may be null only when
  //          num_trainers == 1 (see NOTE in that branch).
  explicit HCCLContextMap(const std::vector<platform::Place> &places,
                          HcclRootInfo *hccl_id = nullptr,
                          size_t num_trainers = 1,
                          size_t trainer_id = 0) {
    PADDLE_ENFORCE_EQ(!places.empty(),
                      true,
                      platform::errors::InvalidArgument(
                          "The HCCL place should not be empty."));
    order_.reserve(places.size());
    for (auto &p : places) {
      int dev_id = p.device;
      order_.emplace_back(dev_id);
      contexts_.emplace(dev_id, HCCLContext(dev_id));
    }
    // A duplicate device would collapse two map entries; detect via size.
    PADDLE_ENFORCE_EQ(
        order_.size(),
        contexts_.size(),
        platform::errors::Unavailable("HCCL Context Map does not support "
                                      "contain two or more same device."));

    std::unique_ptr<HcclComm[]> comms(new HcclComm[order_.size()]);
    // if num_trainers == 1, should create a new nccl id for local comms.
    if (num_trainers == 1 && hccl_id == nullptr) {
      // we do not know how to tackle this situation under hccl
      // std::lock_guard<std::mutex> guard(HCCLGroupGuard::HCCLMutex());
      // PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::ncclCommInitAll(
      //     comms.get(), static_cast<int>(order_.size()), order_.data()));
      // NOTE(review): in this branch comms[] is never initialized, yet the
      // handles are still copied into contexts_ below — confirm this path is
      // not exercised, or guard against use of the uninitialized comms.
    } else {
      PADDLE_ENFORCE_NOT_NULL(
          hccl_id,
          platform::errors::InvalidArgument("The HCCL id should not be null."));
      {
        // Global rank = trainer_id * local-device-count + local index
        // (single-device trainers use trainer_id directly).
        int nranks = num_trainers * order_.size();
        // HCCLGroupGuard gurad;
        for (size_t i = 0; i < order_.size(); ++i) {
          int gpu_id = order_[i];
          int rank;
          if (order_.size() > 1) {
            rank = trainer_id * order_.size() + i;
          } else {
            rank = trainer_id;
          }
          VLOG(1) << "init hccl rank:" << rank << ", nranks:" << nranks
                  << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
          // HcclCommInitRootInfo must run with the target device active.
          SetNPUDeviceId(gpu_id);
          PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommInitRootInfo(
              nranks, hccl_id, rank, comms.get() + i));
        }
      }
    }
    // Hand each freshly created communicator to its device's context.
    int i = 0;
    for (auto &dev_id : order_) {
      contexts_.at(dev_id).comm_ = comms[i++];
    }
  }

  HCCLContextMap(const HCCLContextMap &other) = delete;
  HCCLContextMap &operator=(const HCCLContextMap &other) = delete;

  NPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }

  NPUDeviceContext *DevCtx(platform::Place p) const { return DevCtx(p.device); }

  const HCCLContext &at(platform::Place p) const { return this->at(p.device); }

  const HCCLContext &at(int dev_id) const { return contexts_.at(dev_id); }

  // Blocks until all owned device contexts have drained their work.
  void WaitAll() {
    for (auto &p : contexts_) {
      p.second.ctx_->Wait();
    }
  }
};
// Scope-variable name holding the flat HCCL id for ring `pos`:
// "HCCLID" for ring 0, "HCCLID_<pos>" otherwise.
inline std::string GetFlatHCCLVarName(size_t pos) {
  return pos == 0
             ? std::string(HCCL_ID_VARNAME)
             : string::Sprintf("%s_%d", HCCL_ID_VARNAME, static_cast<int>(pos));
}
// Scope-variable name for the exter (cross-node) hierarchical HCCL id of
// ring `pos`.
inline std::string GetHierarchicalExterHCCLVarName(size_t pos) {
  const int ring_idx = static_cast<int>(pos);
  return string::Sprintf("Hierarchical_exter_%s_%d", HCCL_ID_VARNAME, ring_idx);
}
// Scope-variable name for the inter (intra-node) hierarchical HCCL id of
// ring `pos`.
inline std::string GetHierarchicalInterHCCLVarName(size_t pos) {
  const int ring_idx = static_cast<int>(pos);
  return string::Sprintf("Hierarchical_inter_%s_%d", HCCL_ID_VARNAME, ring_idx);
}
// Owns every HCCL context map used by this trainer process: the flat rings
// for plain allreduce, the inter/exter rings for 2-level (hierarchical)
// allreduce, and a dedicated ring for sync_batch_norm.
class HCCLCommunicator {
 public:
  HCCLCommunicator() {}
  virtual ~HCCLCommunicator() PADDLE_MAY_THROW {}
  // First flat ring, or nullptr when InitFlatCtxs has not run yet.
  HCCLContextMap *DefaultFlatCtx() const {
    if (flat_ctxs_.size() == 0) {
      return nullptr;
    }
    return flat_ctxs_[0].get();
  }
  std::vector<std::unique_ptr<HCCLContextMap>> *GetFlatCtxs() {
    return &flat_ctxs_;
  }
  // Round-robins the flat rings by run order.
  HCCLContextMap *GetFlatCtx(size_t run_order) const {
    return flat_ctxs_[run_order % flat_ctxs_.size()].get();
  }
  // Flat ring normally; the inner (inter) ring when hierarchical allreduce
  // is enabled.
  HCCLContextMap *GetRunEnvHCCLCtx(size_t run_order,
                                   bool use_hierarchical_allreduce) const {
    if (!use_hierarchical_allreduce) {
      return GetFlatCtx(run_order);
    }
    return GetHierarchicalInterCtx(run_order);
  }
  /*
   When nccl inits nccl comm using ncclCommInitAll, it meets error when
   allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So
   create a new nccl comm for sync_batch_norm_op. And these codes should be
   polished with a unified nccl management.
  */
  // Returns the ring used by sync_batch_norm: the default flat ring when an
  // HCCL id variable exists in the scope (multi-trainer mode), otherwise a
  // lazily created local ring.
  HCCLContextMap *GetSyncBatchNormCtx(
      framework::Scope *scope, const std::vector<platform::Place> &places) {
    auto *hccl_id_var = scope->FindVar(HCCL_ID_VARNAME);
    if (hccl_id_var != nullptr) {
      return DefaultFlatCtx();
    }
    if (sync_batch_norm_ctx_.get() == nullptr) {
      sync_batch_norm_ctx_.reset(new HCCLContextMap(places));
    }
    return sync_batch_norm_ctx_.get();
  }
  // Builds the flat rings (one per hccl id, or a single local ring when no
  // ids were supplied) and registers every communicator in HCCLCommContext.
  void InitFlatCtxs(const std::vector<platform::Place> &places,
                    const std::vector<HcclRootInfo *> &hccl_ids,
                    size_t trainers_num,
                    size_t trainer_id) {
    if (hccl_ids.size() == 0) {
      auto ptr = new platform::HCCLContextMap(places);
      VLOG(1) << "init local trainer";
      flat_ctxs_.emplace_back(ptr);
    } else {
      for (size_t i = 0; i < hccl_ids.size(); i++) {
        auto ptr = new platform::HCCLContextMap(
            places, hccl_ids[i], trainers_num, trainer_id);
        VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i;
        flat_ctxs_.emplace_back(ptr);
      }
    }
    // as Executor have no way to use ncclComm created by ParallelExecutor,
    // we assign all flatten contexts to HCCLCommContext to fix.
    int nranks = static_cast<int>(trainers_num * places.size());
    int nrings = static_cast<int>(flat_ctxs_.size());
    for (int ring_id = 0; ring_id < nrings; ++ring_id) {
      for (size_t p = 0; p < places.size(); ++p) {
        // Global rank = trainer offset + local device index.
        int rank = trainer_id * places.size() + p;
        int dev_id = places[p].device;
        auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id);
        HCCLCommContext::Instance().AssignHCCLComm(
            ctx.comm_, nranks, rank, dev_id, ring_id);
      }
    }
  }
  // Builds the two levels of rings for hierarchical allreduce: inter rings
  // connect the trainers inside one group; exter rings connect the group
  // leaders (trainers whose id is a multiple of inter_trainers_num).
  void InitHierarchicalCtxs(const std::vector<platform::Place> &places,
                            const std::vector<HcclRootInfo *> &inter_hccl_ids,
                            const std::vector<HcclRootInfo *> &exter_hccl_ids,
                            size_t trainers_num,
                            size_t trainer_id,
                            size_t inter_trainers_num,
                            size_t exter_trainers_num) {
    PADDLE_ENFORCE_EQ(trainers_num,
                      inter_trainers_num * exter_trainers_num,
                      platform::errors::InvalidArgument(
                          "trainers_num:%llu != inter_trainers_num:%llu * "
                          "exter_trainers_num:%llu",
                          trainers_num,
                          inter_trainers_num,
                          exter_trainers_num));
    PADDLE_ENFORCE_GT(
        inter_trainers_num,
        1,
        platform::errors::InvalidArgument(
            "The inter_trainers_num:%llu should be larger than 1.",
            inter_trainers_num));
    int inter_trainer_id = trainer_id % inter_trainers_num;
    for (size_t i = 0; i < inter_hccl_ids.size(); i++) {
      VLOG(1) << "init inter_trainer_id:" << inter_trainer_id
              << ", comm no:" << i;
      auto local = new HCCLContextMap(
          places, inter_hccl_ids[i], inter_trainers_num, inter_trainer_id);
      h_inter_ctxs_.emplace_back(local);
    }
    // Only group leaders (exter_trainer_id >= 0) join the cross-group rings.
    int exter_trainer_id = -1;
    if (trainer_id % inter_trainers_num == 0) {
      exter_trainer_id = trainer_id / inter_trainers_num;
    }
    if (exter_trainer_id >= 0) {
      for (size_t i = 0; i < exter_hccl_ids.size(); i++) {
        auto ex = new HCCLContextMap(
            places, exter_hccl_ids[i], exter_trainers_num, exter_trainer_id);
        VLOG(1) << "init exter_trainer_id:" << exter_trainer_id
                << ", comm no:" << i;
        h_exter_ctxs_.emplace_back(ex);
      }
    }
  }
  // True when this trainer is a group leader holding exter rings.
  bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; }
  HCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const {
    PADDLE_ENFORCE_GT(h_inter_ctxs_.size(),
                      0,
                      platform::errors::InvalidArgument(
                          "Hierarchical ctxs should be initialized firstly!"));
    return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get();
  }
  HCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const {
    PADDLE_ENFORCE_GT(h_exter_ctxs_.size(),
                      0,
                      platform::errors::InvalidArgument(
                          "Hierarchical ctxs should be initialized firstly!"));
    return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get();
  }
  std::vector<std::unique_ptr<HCCLContextMap>> *GetHierarchicalInterCtxs() {
    return &h_inter_ctxs_;
  }
  std::vector<std::unique_ptr<HCCLContextMap>> *GetHierarchicalExterCtxs() {
    return &h_exter_ctxs_;
  }

 protected:
  // Support multi nccl comm on default nccl ring while HCCLContextMap can't.
  std::vector<std::unique_ptr<HCCLContextMap>> flat_ctxs_;
  // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce.
  // And h_exter_ctxs_ can support multi comm too.
  std::vector<std::unique_ptr<HCCLContextMap>> h_inter_ctxs_;
  std::vector<std::unique_ptr<HCCLContextMap>> h_exter_ctxs_;
  // just used for sync_batch_norm op.
  std::unique_ptr<HCCLContextMap> sync_batch_norm_ctx_;
};
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLE_WITH_ASCEND_CL)
#include <utility>
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
namespace paddle {
namespace platform {
// Concrete HCCLComm that owns its HcclComm handle and a dedicated
// NPUDeviceContext (and therefore its own stream).
class HCCLCommImpl : public HCCLComm {
 public:
  void set_ring_id(int ring_id) { ring_id_ = ring_id; }
  int ring_id() const override { return ring_id_; }
  void set_nranks(int nranks) { nranks_ = nranks; }
  int nranks() const override { return nranks_; }
  void set_rank(int rank) { rank_ = rank; }
  int rank() const override { return rank_; }
  int device_id() const override { return dev_ctx_->GetPlace().device; }
  // NOTE(review): PADDLE_ENFORCE_NPU_SUCCESS may throw, and throwing from a
  // destructor terminates the process -- confirm that HcclCommDestroy
  // failures here are meant to be fatal.
  ~HCCLCommImpl() {
    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommDestroy(comm_));
  }
  void set_comm(HcclComm comm) { comm_ = comm; }
  HcclComm comm() const override { return comm_; }
  // Stream of the owned device context; collectives are issued on it.
  aclrtStream stream() const override { return dev_ctx_->stream(); }
  void set_dev_ctx(std::unique_ptr<NPUDeviceContext>&& dev_ctx) {
    dev_ctx_ = std::move(dev_ctx);
  }
  NPUDeviceContext* dev_context() const override { return dev_ctx_.get(); }

 private:
  int ring_id_;
  int nranks_;
  int rank_;
  HcclComm comm_;
  std::unique_ptr<NPUDeviceContext> dev_ctx_;
};
// Creates a new HCCL communicator for (rank, dev_id) on ring `ring_id` from
// the shared root info `hccl_id`, registers it in the context map, and
// returns the wrapper (owned by HCCLCommContext). Validates all arguments
// before touching the device.
HCCLComm* HCCLCommContext::CreateHCCLComm(
    HcclRootInfo* hccl_id, int nranks, int rank, int dev_id, int ring_id) {
  PADDLE_ENFORCE_NOT_NULL(hccl_id,
                          platform::errors::InvalidArgument(
                              "The hccl unique id should not be null."));
  PADDLE_ENFORCE_GT(
      nranks,
      1,
      platform::errors::InvalidArgument(
          "Expected nranks > 1. But received nranks is %d.", nranks));
  PADDLE_ENFORCE_GE(rank,
                    0,
                    platform::errors::InvalidArgument(
                        "Expected rank >= 0. But received rank is %d.", rank));
  PADDLE_ENFORCE_LT(
      rank,
      nranks,
      platform::errors::InvalidArgument(
          "Expected rank < nranks. But received rank is %d, nranks is %d.",
          rank,
          nranks));
  PADDLE_ENFORCE_GE(
      dev_id,
      0,
      platform::errors::InvalidArgument(
          "Expected dev_id >= 0. But received dev_id is %d.", dev_id));
  HcclComm comm;
  SetNPUDeviceId(dev_id);
  // Fixed log message: this log fires *before* initialization; it previously
  // claimed "initialized comm", duplicating the post-init log below.
  VLOG(1) << "begin to init comm: " << &comm << ", nranks: " << nranks
          << ", hccl_id: " << hccl_id << ", rank: " << rank;
  PADDLE_ENFORCE_NPU_SUCCESS(
      platform::dynload::HcclCommInitRootInfo(nranks, hccl_id, rank, &comm));
  VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks
          << ", hccl_id: " << hccl_id << ", rank: " << rank;
  auto* comm_wrapper = AssignHCCLComm(comm, nranks, rank, dev_id, ring_id);
  VLOG(1) << "hccl communicator of rank " << rank << " in ring " << ring_id
          << " has been created on device " << dev_id
          << ", with comm: " << comm_wrapper->comm();
  // Release all communicators at process exit; registered exactly once.
  std::call_once(once_flag_, []() {
    std::atexit([]() { HCCLCommContext::Instance().ReleaseHCCLComms(); });
  });
  return comm_wrapper;
}
// Wraps an existing HcclComm for (dev_id, ring_id) in an HCCLCommImpl with a
// dedicated NPUDeviceContext, stores it in comm_map_, and returns the stored
// wrapper. For ring 0 the raw comm is also published on the global device
// context so plain executors can reach it.
HCCLComm* HCCLCommContext::AssignHCCLComm(
    HcclComm comm, int nranks, int rank, int dev_id, int ring_id) {
  std::unique_ptr<NPUDeviceContext> dev_ctx(
      new NPUDeviceContext(NPUPlace(dev_id)));
  HCCLCommImpl* c = new HCCLCommImpl;
  c->set_ring_id(ring_id);
  c->set_nranks(nranks);
  c->set_rank(rank);
  c->set_comm(comm);
  c->set_dev_ctx(std::move(dev_ctx));
  HCCLComm* comm_wrapper = nullptr;
  {
    // Fixed locking: the original used a manual lock()/unlock() pair (not
    // exception-safe) and re-read comm_map_[ring_id][dev_id] after the
    // unlock. Hold a scoped lock for the whole map update and fetch the
    // stored pointer while still under the lock.
    std::lock_guard<std::mutex> map_lock(comm_map_mutex_);
    auto& dev2comm = comm_map_[ring_id];  // operator[] creates the inner map
    dev2comm.emplace(dev_id, std::unique_ptr<HCCLComm>(c));
    comm_wrapper = dev2comm[dev_id].get();
  }
  if (ring_id == 0) {
    auto* dev_ctx = static_cast<platform::NPUDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(
            platform::NPUPlace(dev_id)));
    dev_ctx->set_hccl_comm(comm);
  }
  return comm_wrapper;
}
// Destroys every communicator wrapper held in comm_map_; the nested map
// structure itself is kept (entries become empty unique_ptrs).
void HCCLCommContext::ReleaseHCCLComms() {
  for (auto& ring_entry : comm_map_) {
    auto& dev2comm = ring_entry.second;
    for (auto& dev_entry : dev2comm) {
      dev_entry.second.reset();
    }
  }
}
} // namespace platform
} // namespace paddle
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include <algorithm>
#include <cstdlib>
#include <memory>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/string/split.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_uint64(gpu_memory_limit_mb);
DECLARE_string(selected_npus);
DECLARE_string(npu_config_path);
constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_NPU_MEM_STAT;
namespace paddle {
namespace platform {
// Queries the ACL runtime for the number of visible NPU devices.
static int GetNPUDeviceCountImpl() {
  uint32_t device_count = 0;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDeviceCount(&device_count));
  return static_cast<int>(device_count);
}
// Device count cannot change during the process lifetime, so query once and
// cache the result.
int GetNPUDeviceCount() {
  static const int cached_count = GetNPUDeviceCountImpl();
  return cached_count;
}
// Returns nonzero when device `src` may directly access memory on `dst`.
int NPUCanAccessPeer(int src, int dst) {
  int accessible = 0;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtDeviceCanAccessPeer(&accessible, src, dst));
  return accessible;
}
// Returns the ACL runtime version as "major.minor.patch", e.g. "1.0.1".
// NOTE(review): `id` is only range-checked here; aclrtGetVersion takes no
// device argument, so the reported version is process-wide -- confirm a
// per-device version was not intended.
std::string GetNPURuntimeVersion(int id) {
  PADDLE_ENFORCE_LT(id,
                    GetNPUDeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than NPU count, "
                        "but received id is: %d. NPU count is: %d.",
                        id,
                        GetNPUDeviceCount()));
  int major = 0, minor = 0, patch = 0;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetVersion(&major, &minor, &patch));
  return string::Sprintf("%d.%d.%d", major, minor, patch);
}
// Returns the device id currently bound to this thread by aclrtSetDevice.
int GetCurrentNPUDeviceId() {
  int device_id;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDevice(&device_id));
  return device_id;
}
// Fetches the thread's current ACL context into *context.
void GetCurrentNPUContext(aclrtContext *context) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(context));
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedNPUDevices() {
  std::vector<int> devices;
  if (FLAGS_selected_npus.empty()) {
    // No explicit selection: use every visible device.
    const int count = GetNPUDeviceCount();
    for (int dev_id = 0; dev_id < count; ++dev_id) {
      devices.push_back(dev_id);
    }
  } else {
    // Single-node multi-process mode: honor the user-supplied id list.
    for (const auto &token : paddle::string::Split(FLAGS_selected_npus, ',')) {
      devices.push_back(atoi(token.c_str()));
    }
  }
  return devices;
}
// Makes `id` the active NPU device for subsequent ACL calls on this thread.
void SetNPUDeviceId(int id) {
  PADDLE_ENFORCE_LT(id,
                    GetNPUDeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than NPU count, "
                        "but received id is: %d. NPU count is: %d.",
                        id,
                        GetNPUDeviceCount()));
  // NOTE(zihqiu): It is recommended to call aclrtSetDevice and aclrtResetDevice
  // in pairs.
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(id));
}
// Releases the resources aclrtSetDevice created for device `id`; should be
// paired with SetNPUDeviceId.
void ResetNPUDeviceId(int id) {
  PADDLE_ENFORCE_LT(id,
                    GetNPUDeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than NPU count, "
                        "but received id is: %d. NPU count is: %d.",
                        id,
                        GetNPUDeviceCount()));
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtResetDevice(id));
}
// Reports the (possibly limit-clamped) free and total memory of the current
// device; the raw driver numbers are queried but discarded here.
void NPUMemoryUsage(size_t *available, size_t *total) {
  size_t raw_available = 0;
  size_t raw_total = 0;
  RecordedNPUMemGetInfo(available,
                        total,
                        &raw_available,
                        &raw_total,
                        platform::GetCurrentNPUDeviceId());
}
// Returns the number of bytes still available for allocation on the current
// device: available memory minus a small safety reserve; 0 when less than
// one minimum chunk remains.
size_t NPUAvailableMemToAlloc() {
  size_t total = 0;
  size_t available = 0;
  NPUMemoryUsage(&available, &total);
  // NOTE(review): the reserve is a fraction of *available* memory here --
  // confirm it was not meant to be a fraction of total.
  size_t reserving =
      static_cast<size_t>(fraction_reserve_gpu_memory * available);
  // If available size is less than minimum chunk size, no usable memory exists
  size_t available_to_alloc = available - reserving;
  size_t min_chunk_size = NPUMinChunkSize();
  if (available_to_alloc < min_chunk_size) {
    available_to_alloc = 0;
  }
  VLOG(10) << "NPU usage " << (available >> 20) << "M/" << (total >> 20)
           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
  return available_to_alloc;
}
// The hard cap is whichever of the initial / re-allocation sizes is larger.
size_t NPUMaxAllocSize() {
  const size_t init_size = NPUInitAllocSize();
  const size_t realloc_size = NPUReallocSize();
  return init_size > realloc_size ? init_size : realloc_size;
}
// Computes how many bytes the allocator should grab: the explicit
// FLAGS_*_gpu_memory_in_mb value (flags shared with the GPU build) when set,
// otherwise a fraction of the currently available memory. Errors out when
// the request cannot be satisfied.
static size_t NPUAllocSize(bool realloc) {
  size_t available_to_alloc = NPUAvailableMemToAlloc();
  PADDLE_ENFORCE_GT(
      available_to_alloc,
      0,
      platform::errors::ResourceExhausted("Not enough available NPU memory."));
  // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
  // allocated by fraction
  size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
                           : FLAGS_initial_gpu_memory_in_mb;
  size_t alloc_bytes =
      (flag_mb > 0ul
           ? flag_mb << 20
           : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
  PADDLE_ENFORCE_GE(
      available_to_alloc,
      alloc_bytes,
      platform::errors::ResourceExhausted("Not enough available NPU memory."));
  VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
           << " MiB, is it Re-alloc: " << realloc;
  return alloc_bytes;
}
// Chunk size for the first allocation.
size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); }
// Chunk size for subsequent (re-)allocations.
size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); }
// Upper bound on a single buddy-allocator chunk (same as NPUMaxAllocSize).
size_t NPUMaxChunkSize() {
  size_t max_chunk_size = NPUMaxAllocSize();
  VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
  return max_chunk_size;
}
// Async copy of `count` bytes from src to dst on `stream`.
// dst_max_count is the destination capacity passed to ACL; 0 means `count`.
void NPUMemcpyAsync(void *dst,
                    const void *src,
                    size_t count,
                    enum aclrtMemcpyKind kind,
                    aclrtStream stream,
                    size_t dst_max_count) {
  dst_max_count = dst_max_count ? dst_max_count : count;
  VLOG(4) << dst << " " << dst_max_count << " " << src << " " << count << " "
          << kind << " " << stream;
  PADDLE_ENFORCE_NPU_SUCCESS(
      aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
}
// Blocking copy of `count` bytes from src to dst.
void NPUMemcpySync(void *dst,
                   const void *src,
                   size_t count,
                   enum aclrtMemcpyKind kind,
                   size_t dst_max_count) {
  // NOTE(zhiqiu): The default max_count is count
  dst_max_count = dst_max_count ? dst_max_count : count;
  VLOG(4) << dst << " " << dst_max_count << " " << src << " " << count << " "
          << kind;
  // Zero-size allocations carry a null pointer; skip the ACL call for them.
  if (dst == nullptr && dst_max_count == 0) {
    VLOG(4) << "Dot not call aclrtMemcpy for zero_size_allocation on NPU";
    return;
  }
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
}
// Async copy towards another device.
// NOTE(review): dst_device is unused; the call is a plain aclrtMemcpyAsync,
// presumably relying on the runtime to resolve the peer address -- confirm.
void NPUMemcpyPeerASync(void *dst,
                        int dst_device,
                        const void *src,
                        size_t count,
                        enum aclrtMemcpyKind kind,
                        aclrtStream stream,
                        size_t dst_max_count) {
  dst_max_count = dst_max_count ? dst_max_count : count;
  PADDLE_ENFORCE_NPU_SUCCESS(
      aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
}
// Blocking copy towards another device; dst_device is likewise unused.
void NPUMemcpyPeerSync(void *dst,
                       int dst_device,
                       const void *src,
                       size_t count,
                       enum aclrtMemcpyKind kind,
                       size_t dst_max_count) {
  // NOTE(zhiqiu): The default max_count is count
  dst_max_count = dst_max_count ? dst_max_count : count;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
}
// Blocking memset of `count` bytes; max_count (capacity) defaults to count.
void NPUMemsetSync(void *dst, int value, size_t count, size_t max_count) {
  max_count = max_count ? max_count : count;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemset(dst, max_count, value, count));
}
// Async memset of `count` bytes on `stream`; max_count defaults to count.
void NPUMemsetAsync(
    void *dst, int value, size_t count, aclrtStream stream, size_t max_count) {
  max_count = max_count ? max_count : count;
  PADDLE_ENFORCE_NPU_SUCCESS(
      aclrtMemsetAsync(dst, max_count, value, count, stream));
}
// Create an NPU stream.
void NPUStreamCreate(aclrtStream *stream) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateStream(stream));
}
// Block until every task queued on `stream` has completed.
void NPUStreamSync(aclrtStream stream) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream));
}
// Destroy an NPU stream.
void NPUStreamDestroy(aclrtStream stream) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyStream(stream));
}
// Create an NPU event.
void NPUEventCreate(aclrtEvent *event) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(event));
}
// Destroy an NPU event.
void NPUEventDestroy(aclrtEvent event) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
}
// Record `event` into `stream`.
void NPUEventRecord(aclrtEvent event, aclrtStream stream) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream));
}
// Query the completion status of `event` without blocking.
void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, status));
}
// Block until `event` has completed.
void NPUEventSynchronize(aclrtEvent event) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeEvent(event));
}
// Make `stream` wait for `event` before running further tasks.
void NPUStreamWaitEvent(aclrtStream stream, aclrtEvent event) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtStreamWaitEvent(stream, event));
}
// Re-raises any NPU error except out-of-memory: ACL_ERROR_BAD_ALLOC is
// swallowed (status reset to ACL_ERROR_NONE) so callers can handle OOM
// themselves, while all other errors enforce-fail here.
static void RaiseNonOutOfMemoryError(aclError *status) {
  if (*status == ACL_ERROR_BAD_ALLOC) {
    *status = ACL_ERROR_NONE;
  }
  PADDLE_ENFORCE_NPU_SUCCESS(*status);
}
// Per-device wrapper around aclrtMalloc/aclrtFree. When
// FLAGS_gpu_memory_limit_mb (flag shared with the GPU build) is non-zero it
// enforces that limit and records the live allocation size.
class RecordedNPUMallocHelper {
 private:
  explicit RecordedNPUMallocHelper(int dev_id, uint64_t limit_size = 0)
      : dev_id_(dev_id), limit_size_(limit_size) {
    // The mutex guarding cur_size_ is only needed in recording mode.
    if (NeedRecord()) {
      mtx_.reset(new std::mutex());
    }
  }
  DISABLE_COPY_AND_ASSIGN(RecordedNPUMallocHelper);

 public:
  // Lazily builds one helper per visible device; dev_id must be in range.
  static RecordedNPUMallocHelper *Instance(int dev_id) {
    std::call_once(once_flag_, [] {
      int dev_cnt = GetNPUDeviceCount();
      instances_.reserve(dev_cnt);
      for (int i = 0; i < dev_cnt; ++i) {
        // NOTE(zhiqiu): share the flags with gpu, avoid more flags.
        instances_.emplace_back(
            new RecordedNPUMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20));
      }
    });
    PADDLE_ENFORCE_GE(
        dev_id,
        0,
        platform::errors::OutOfRange(
            "Device id must be not less than 0, but got %d.", dev_id));
    PADDLE_ENFORCE_LT(
        dev_id,
        instances_.size(),
        platform::errors::OutOfRange("Device id %d exceeds npu card number %d.",
                                     dev_id,
                                     instances_.size()));
    return instances_[dev_id].get();
  }
  /**
   * Try to allocate `size` npu memory. Only ACL_ERROR_BAD_ALLOC
   * or ACL_ERROR_NONE would be returned.
   */
  aclError Malloc(void **ptr, size_t size) {
    LockGuardPtr<std::mutex> lock(mtx_);
    // Reject up-front when the request would exceed the configured limit.
    if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
      return ACL_ERROR_BAD_ALLOC;
    }
    NPUDeviceGuard guard(dev_id_);
    auto result = aclrtMalloc(ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
    if (result == ACL_ERROR_NONE) {
      if (NeedRecord()) {
        cur_size_ += size;
      }
      STAT_INT_ADD("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
      return result;
    } else {
      RaiseNonOutOfMemoryError(&result);
      // Non out of memory error would be raised inside
      // RaiseNonOutOfMemoryError. Therefore, we can
      // return cudaErrorMemoryAllocation directly here.
      return ACL_ERROR_BAD_ALLOC;
    }
  }
  /**
   * Free gpu memory. Usually, free is not allowed to raise error.
   * If it does raise error, the process should be crashed.
   */
  void Free(void *ptr, size_t size) {
    NPUDeviceGuard guard(dev_id_);
    auto result = aclrtFree(ptr);
    PADDLE_ENFORCE_NPU_SUCCESS(result);
    if (NeedRecord()) {
      std::lock_guard<std::mutex> guard(*mtx_);
      cur_size_ -= size;
    }
    STAT_INT_SUB("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
  }
  // Fills avail/total with the (possibly limit-clamped) sizes and
  // actual_avail/actual_total with the raw driver numbers. Returns true when
  // the reported total was clamped by the limit.
  bool GetMemInfo(size_t *avail,
                  size_t *total,
                  size_t *actual_avail,
                  size_t *actual_total) {
    {
      NPUDeviceGuard guard(dev_id_);
      auto result = aclrtGetMemInfo(ACL_HBM_MEM, actual_avail, actual_total);
      if (result != ACL_ERROR_NONE) {
        *actual_avail = 0;
      }
      RaiseNonOutOfMemoryError(&result);
    }
    if (NeedRecord()) {
      std::lock_guard<std::mutex> guard(*mtx_);
      *avail = std::min(*actual_avail, limit_size_ - cur_size_);
      *total = std::min(*actual_total, limit_size_);
      return *total < *actual_total;
    } else {
      *avail = *actual_avail;
      *total = *actual_total;
      return false;
    }
  }
  // Recording is active only when a non-zero limit was configured.
  inline bool NeedRecord() const { return limit_size_ != 0; }
  // Bytes currently allocated through this helper (0 when not recording).
  uint64_t RecordedSize() const {
    LockGuardPtr<std::mutex> lock(mtx_);
    return NeedRecord() ? cur_size_ : 0;
  }
  uint64_t LimitSize() const { return limit_size_; }

 private:
  const int dev_id_;
  const uint64_t limit_size_;
  uint64_t cur_size_{0};  // guarded by mtx_ when recording
  mutable std::unique_ptr<std::mutex> mtx_;
  static std::once_flag once_flag_;
  static std::vector<std::unique_ptr<RecordedNPUMallocHelper>> instances_;
};
std::once_flag RecordedNPUMallocHelper::once_flag_;
std::vector<std::unique_ptr<RecordedNPUMallocHelper>>
    RecordedNPUMallocHelper::instances_;
// Allocate on device `dev_id` with limit/record bookkeeping.
aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id) {
  return RecordedNPUMallocHelper::Instance(dev_id)->Malloc(ptr, size);
}
// Free memory previously obtained from RecordedNPUMalloc.
void RecordedNPUFree(void *p, size_t size, int dev_id) {
  return RecordedNPUMallocHelper::Instance(dev_id)->Free(p, size);
}
// Memory info of `dev_id`, clamped by the configured limit when present.
bool RecordedNPUMemGetInfo(size_t *avail,
                           size_t *total,
                           size_t *actual_avail,
                           size_t *actual_total,
                           int dev_id) {
  return RecordedNPUMallocHelper::Instance(dev_id)->GetMemInfo(
      avail, total, actual_avail, actual_total);
}
// Bytes currently recorded as allocated on `dev_id` (0 if not recording).
uint64_t RecordedNPUMallocSize(int dev_id) {
  return RecordedNPUMallocHelper::Instance(dev_id)->RecordedSize();
}
bool IsNPUMallocRecorded(int dev_id) {
  return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord();
}
// Pinned host memory helpers.
aclError NPUHostMalloc(void **ptr, size_t size) {
  return aclrtMallocHost(ptr, size);
}
aclError NPUHostFree(void *ptr) { return aclrtFreeHost(ptr); }
// Enqueues host callback `fn` (with `userData`) on `stream`.
void NPULaunchCallback(aclrtCallback fn,
                       void *userData,
                       aclrtCallbackBlockType blockType,
                       aclrtStream stream) {
  const auto status = aclrtLaunchCallback(fn, userData, blockType, stream);
  PADDLE_ENFORCE_NPU_SUCCESS(status);
}
AclInstance::~AclInstance() {}
// Meyers singleton managing process-wide ACL init/finalize and the default
// per-device contexts.
AclInstance &AclInstance::Instance() {
  static AclInstance instance;
  return instance;
}
// Initializes ACL (optionally from FLAGS_npu_config_path) and activates all
// selected devices so their default contexts exist.
AclInstance::AclInstance() {
  if (!FLAGS_npu_config_path.empty()) {
    VLOG(4) << "Call aclInit(" << FLAGS_npu_config_path << ") ";
    PADDLE_ENFORCE_NPU_SUCCESS(aclInit(FLAGS_npu_config_path.c_str()));
  } else {
    VLOG(4) << "Call aclInit(nullptr) ";
    PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
  }
  VLOG(4) << "Call aclrtSetDevice ";
  // NOTE(zhiqiu): why set devices here?
  // Because ACL creates a default context which contains 2 streams
  // when calling aclrtSetDeviceId, so usually we do not need to
  // create contexts explicitly. And, for each device, aclrtSetDeviceId
  // needs to be paired with aclrtResetDeviceId to destroy the default
  // context. Here, we use this singleton and static instance to manage
  // the devices to make sure they will be reset before program exit.
  devices_ = platform::GetSelectedNPUDevices();
  for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
    SetNPUDeviceId(*it);
    VLOG(4) << "Call aclrtSetDevice " << *it;
  }
}
// Resets every device activated in the constructor and finalizes ACL.
void AclInstance::Finalize() {
  // NOTE(zhiqiu): DO NOT perform finalize in destructor
  // to avoid problems caused by destructor order of static
  // object.
  for (size_t i = 0; i < devices_.size(); ++i) {
    auto status = aclrtResetDevice(devices_[i]);
    VLOG(4) << "Call aclrtResetDevice " << devices_[i]
            << " status = " << status;
  }
  auto status = aclFinalize();
  VLOG(4) << "Call aclFinalize, status = " << status;
}
} // namespace platform
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <stddef.h>
#include <string>
#include <vector>
#include "acl/acl.h"
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/phi/backends/npu/npu_info.h"
namespace paddle {
namespace platform {
//! Get the total number of NPU devices in system.
int GetNPUDeviceCount();
//! Get the runtime version of the ith NPU
std::string GetNPURuntimeVersion(int id);
//! Check if this device can access peer or not.
int NPUCanAccessPeer(int src, int dst);
//! Get the current NPU device id in system.
int GetCurrentNPUDeviceId();
//! Get the current NPU context.
void GetCurrentNPUContext(aclrtContext *context);
//! Get the current NPU stream.
//! NOTE(review): no definition of GetCurrentStream is visible in this file;
//! the implementation defines GetCurrentNPUStream instead -- confirm which
//! is the canonical API.
int GetCurrentStream();
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedNPUDevices();
//! Set the NPU device id for next execution.
void SetNPUDeviceId(int device_id);
//! Reset the NPU device id for next execution.
void ResetNPUDeviceId(int device_id);
//! Get the memory usage of current NPU device.
void NPUMemoryUsage(size_t *available, size_t *total);
//! Get the available memory to allocate, which is the size of available npu
//! minus reserving.
size_t NPUAvailableMemToAlloc();
//! Get the maximum allocation size of current NPU device.
size_t NPUMaxAllocSize();
//! Get the initial allocation size of current NPU device.
size_t NPUInitAllocSize();
//! Get the re-allocation size of current NPU device.
size_t NPUReallocSize();
using phi::backends::npu::NPUMinChunkSize;
//! Get the maximum chunk size for NPU buddy allocator.
size_t NPUMaxChunkSize();
//! Copy memory from address src to dst asynchronously.
void NPUMemcpyAsync(void *dst,
                    const void *src,
                    size_t count,
                    enum aclrtMemcpyKind kind,
                    aclrtStream stream,
                    size_t dst_max_count = 0);
//! Copy memory from address src to dst synchronously.
void NPUMemcpySync(void *dst,
                   const void *src,
                   size_t count,
                   enum aclrtMemcpyKind kind,
                   size_t dst_max_count = 0);
//! Set memory dst with value count size synchronously.
void NPUMemsetSync(void *dst, int value, size_t count, size_t max_count = 0);
//! Set memory dst with value count size asynchronously
void NPUMemsetAsync(void *dst,
                    int value,
                    size_t count,
                    aclrtStream stream,
                    size_t max_count = 0);
//! Copy memory from one device to another device asynchronously.
//! NOTE(review): the visible implementation is spelled NPUMemcpyPeerASync and
//! takes an aclrtMemcpyKind parameter instead of src_device; this declaration
//! does not match it -- confirm which signature is intended.
void NPUMemcpyPeerAsync(void *dst,
                        int dst_device,
                        const void *src,
                        int src_device,
                        size_t count,
                        aclrtStream stream,
                        size_t max_count = 0);
//! Copy memory from one device to another device synchronously.
//! NOTE(review): the visible implementation takes an aclrtMemcpyKind
//! parameter instead of src_device; confirm against the definition.
void NPUMemcpyPeerSync(void *dst,
                       int dst_device,
                       const void *src,
                       int src_device,
                       size_t count,
                       size_t max_count = 0);
//! Create NPU stream.
void NPUStreamCreate(aclrtStream *stream);
//! Blocks until stream has completed all operations.
void NPUStreamSync(aclrtStream stream);
//! Destroy NPU stream.
void NPUStreamDestroy(aclrtStream stream);
//! Create NPU Event.
void NPUEventCreate(aclrtEvent *event);
//! Destroy NPU Event.
void NPUEventDestroy(aclrtEvent event);
//! Query NPU event status.
void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status);
//! Record NPU event in the stream.
void NPUEventRecord(aclrtEvent event, aclrtStream stream);
//! Synchronize NPU event.
void NPUEventSynchronize(aclrtEvent event);
//! Makes a stream wait on an event.
void NPUStreamWaitEvent(aclrtStream stream, aclrtEvent event);
//! Alloc host or device memory.
aclError NPUHostMalloc(void **ptr, size_t size);
//! Frees host or device memory.
aclError NPUHostFree(void *ptr);
//! aclrtMalloc with recorded info
aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id);
//! aclrtFree with recorded info
void RecordedNPUFree(void *p, size_t size, int dev_id);
//! Get available and total gpu memory with considering limitation
bool RecordedNPUMemGetInfo(size_t *avail,
                           size_t *total,
                           size_t *actual_avail,
                           size_t *actual_total,
                           int dev_id);
//! Get recorded actrtMalloc size. If record is disabled, return 0.
uint64_t RecordedNPUMallocSize(int dev_id);
bool IsNPUMallocRecorded(int dev_id);
//! Adds a callback function executed on the host or device to the stream.
void NPULaunchCallback(aclrtCallback fn,
                       void *userData,
                       aclrtCallbackBlockType blockType,
                       aclrtStream stream);
// RAII guard that switches the calling thread to `dev_id` on construction
// and restores the previously active NPU device on destruction.
// Non-copyable.
class NPUDeviceGuard {
 public:
  explicit inline NPUDeviceGuard(int dev_id) {
    const int current = platform::GetCurrentNPUDeviceId();
    if (current != dev_id) {
      prev_id_ = current;
      platform::SetNPUDeviceId(dev_id);
    }
  }
  inline ~NPUDeviceGuard() {
    // prev_id_ == -1 means the requested device was already active.
    if (prev_id_ != -1) {
      platform::SetNPUDeviceId(prev_id_);
    }
  }
  NPUDeviceGuard(const NPUDeviceGuard &o) = delete;
  NPUDeviceGuard &operator=(const NPUDeviceGuard &o) = delete;

 private:
  int prev_id_{-1};
};
// Singleton managing process-wide aclInit/aclFinalize and the default
// per-device contexts; see the implementation file for details.
class AclInstance {
 public:
  // NOTE(zhiqiu): Commonly, exception in destructor is not recommended, so
  // no PADDLE_ENFORCE here, call acl API directly.
  ~AclInstance();
  AclInstance(const AclInstance &o) = delete;
  const AclInstance &operator=(const AclInstance &o) = delete;
  static AclInstance &Instance();
  // Must be called explicitly before process exit; deliberately not done in
  // the destructor (static destruction order).
  void Finalize();
 private:
  // forbid calling default constructor
  AclInstance();
  // Device ids activated in the constructor; reset again in Finalize().
  std::vector<int> devices_;
};
} // namespace platform
} // namespace paddle
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include <paddle/fluid/framework/data_type.h>
#include <paddle/fluid/framework/operator.h>
#include <map>
#include <string>
#include <vector>
#include "acl/acl.h"
#include "acl/acl_op_compiler.h"
#include "paddle/fluid/framework/framework.pb.h"
DECLARE_string(npu_precision_mode);
namespace paddle {
namespace operators {
// Lookup table from Paddle var dtype to the ACL dtype enum; dtypes missing
// here are rejected by ConvertToNpuDtype.
static std::map<framework::proto::VarType::Type, aclDataType>
    DTYPE_2_ACL_DTYPE = {
        {framework::proto::VarType::BOOL, ACL_BOOL},
        {framework::proto::VarType::UINT8, ACL_UINT8},
        {framework::proto::VarType::INT8, ACL_INT8},
        {framework::proto::VarType::INT16, ACL_INT16},
        {framework::proto::VarType::INT32, ACL_INT32},
        {framework::proto::VarType::INT64, ACL_INT64},
        {framework::proto::VarType::FP16, ACL_FLOAT16},
        {framework::proto::VarType::FP32, ACL_FLOAT},
        {framework::proto::VarType::FP64, ACL_DOUBLE},
};
// Lookup table from Paddle data layout to the ACL tensor format.
static std::map<DataLayout, aclFormat> DATA_LAYOUT_2_ACL_FORMAT = {
    {DataLayout::kNCHW, ACL_FORMAT_NCHW},
    {DataLayout::kNHWC, ACL_FORMAT_NHWC},
    {DataLayout::kNCDHW, ACL_FORMAT_NCDHW},
    {DataLayout::kNDHWC, ACL_FORMAT_NDHWC},
    {DataLayout::kAnyLayout, ACL_FORMAT_ND},
};
// Translates a Paddle var dtype into the ACL dtype enum; raises NotFound for
// dtypes absent from DTYPE_2_ACL_DTYPE.
aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype) {
  auto iter = DTYPE_2_ACL_DTYPE.find(dtype);
  PADDLE_ENFORCE_NE(iter,
                    DTYPE_2_ACL_DTYPE.end(),
                    platform::errors::NotFound(
                        "The data type (%s) can not convert to ACL data type.",
                        framework::DataTypeToString(dtype)));
  return iter->second;
}
// Translates a Paddle data layout into the ACL tensor format; raises
// NotFound for layouts absent from DATA_LAYOUT_2_ACL_FORMAT.
aclFormat ConvertToNpuFormat(DataLayout layout) {
  auto iter = DATA_LAYOUT_2_ACL_FORMAT.find(layout);
  PADDLE_ENFORCE_NE(
      iter,
      DATA_LAYOUT_2_ACL_FORMAT.end(),
      platform::errors::NotFound(
          // Fixed message: this converts a data layout, not a data type.
          "The data layout (%s) can not convert to ACL data format.", layout));
  return iter->second;
}
// Returns the default compute stream of device `device_id`; -1 selects the
// currently active NPU device.
aclrtStream GetCurrentNPUStream(int device_id) {
  if (device_id == -1) {
    device_id = platform::GetCurrentNPUDeviceId();
  }
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
      pool.Get(platform::NPUPlace(device_id)));
  return dev_ctx->stream();
}
NpuOpRunner::NpuOpRunner() {}
// Construct a runner for the given ACL op type without operands.
NpuOpRunner::NpuOpRunner(const std::string &op_type) : op_type_(op_type) {}
// Construct a runner and immediately register inputs, outputs and attrs.
NpuOpRunner::NpuOpRunner(const std::string &op_type,
                         const std::vector<Tensor> &inputs,
                         const std::vector<Tensor> &outputs,
                         const NPUAttributeMap &attrs)
    : op_type_(op_type) {
  AddInputs(inputs);
  AddOutputs(outputs);
  AddAttrs(attrs);
}
// Releases every ACL attribute, tensor descriptor and data buffer created
// for this runner.
// NOTE(review): PADDLE_ENFORCE_NPU_SUCCESS can throw from this destructor,
// which would terminate the process -- confirm that is intended.
NpuOpRunner::~NpuOpRunner() {
  VLOG(5) << "Free NpuOpRunner(" << this << ") of " << op_type_;
  // Is it safe to free the descs/buffers after run called in host ?
  aclopDestroyAttr(attr_);  // return void
  for (auto desc : input_descs_) {
    aclDestroyTensorDesc(desc);
  }
  for (auto desc : output_descs_) {
    aclDestroyTensorDesc(desc);
  }
  for (auto buffer : input_buffers_) {
    PADDLE_ENFORCE_NPU_SUCCESS(aclDestroyDataBuffer(buffer));
  }
  for (auto buffer : output_buffers_) {
    PADDLE_ENFORCE_NPU_SUCCESS(aclDestroyDataBuffer(buffer));
  }
}
// Name of the wrapped ACL op.
const std::string &NpuOpRunner::Type() { return op_type_; }
// Overwrite the ACL op type; returns *this for chaining.
NpuOpRunner &NpuOpRunner::SetType(const std::string &name) {
  op_type_ = name;
  return *this;
}
// Registers attribute `name` on the wrapped ACL op, dispatching on the
// runtime type held by the NPUAttribute variant. std::vector<bool> and
// std::vector<int> are widened to uint8_t / int64_t as required by the
// aclopSetAttrList* APIs. Returns *this for chaining; throws Unimplemented
// for unsupported attribute types.
NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name,
                                  const NPUAttribute &attr) {
  // Lazily create the aclopAttr container on first use.
  if (!attr_) {
    attr_ = aclopCreateAttr();
  }
  if (attr.type() == typeid(bool)) {
    PADDLE_ENFORCE_NPU_SUCCESS(
        aclopSetAttrBool(attr_, name.c_str(), PADDLE_GET_CONST(bool, attr)));
  } else if (attr.type() == typeid(int)) {
    PADDLE_ENFORCE_NPU_SUCCESS(
        aclopSetAttrInt(attr_, name.c_str(), PADDLE_GET_CONST(int, attr)));
  } else if (attr.type() == typeid(int64_t)) {
    PADDLE_ENFORCE_NPU_SUCCESS(
        aclopSetAttrInt(attr_, name.c_str(), PADDLE_GET_CONST(int64_t, attr)));
  } else if (attr.type() == typeid(float)) {
    PADDLE_ENFORCE_NPU_SUCCESS(
        aclopSetAttrFloat(attr_, name.c_str(), PADDLE_GET_CONST(float, attr)));
  } else if (attr.type() == typeid(std::vector<bool>)) {
    // ACL has no bool-list setter; widen to uint8_t.
    auto a = PADDLE_GET_CONST(std::vector<bool>, attr);
    std::vector<uint8_t> cast_a;
    for (auto it : a) {
      cast_a.push_back(static_cast<uint8_t>(it));
    }
    PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrListBool(
        attr_, name.c_str(), cast_a.size(), cast_a.data()));
  } else if (attr.type() == typeid(std::vector<int>)) {
    // ACL int lists are int64-based; widen each element.
    auto a = PADDLE_GET_CONST(std::vector<int>, attr);
    std::vector<int64_t> cast_a;
    for (auto it : a) {
      cast_a.push_back(static_cast<int64_t>(it));
    }
    PADDLE_ENFORCE_NPU_SUCCESS(
        aclopSetAttrListInt(attr_, name.c_str(), cast_a.size(), cast_a.data()));
  } else if (attr.type() == typeid(std::vector<int64_t>)) {
    auto a = PADDLE_GET_CONST(std::vector<int64_t>, attr);
    PADDLE_ENFORCE_NPU_SUCCESS(
        aclopSetAttrListInt(attr_, name.c_str(), a.size(), a.data()));
  } else if (attr.type() == typeid(std::vector<float>)) {
    auto a = PADDLE_GET_CONST(std::vector<float>, attr);
    PADDLE_ENFORCE_NPU_SUCCESS(
        aclopSetAttrListFloat(attr_, name.c_str(), a.size(), a.data()));
  } else if (attr.type() == typeid(std::string)) {
    auto a = PADDLE_GET_CONST(std::string, attr);
    PADDLE_ENFORCE_NPU_SUCCESS(
        aclopSetAttrString(attr_, name.c_str(), a.c_str()));
  } else if (attr.type() == typeid(std::vector<std::string>)) {
    auto a = PADDLE_GET_CONST(std::vector<std::string>, attr);
    std::vector<const char *> s;
    for (auto &it : a) {
      s.push_back(it.data());
    }
    PADDLE_ENFORCE_NPU_SUCCESS(
        aclopSetAttrListString(attr_, name.c_str(), s.size(), s.data()));
  } else if (attr.type() == typeid(std::vector<std::vector<int64_t>>)) {
    auto a = PADDLE_GET_CONST(std::vector<std::vector<int64_t>>, attr);
    std::vector<int64_t *> data;
    std::vector<int> num;
    for (auto &&v : a) {
      data.push_back(v.data());
      num.push_back(v.size());
    }
    PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrListListInt(
        attr_, name.c_str(), data.size(), num.data(), data.data()));
  } else {
    // Fixed error message: previously misspelled "attribubte" and repeated
    // "to convert".
    PADDLE_THROW(platform::errors::Unimplemented(
        "Can not convert attribute '%s' to aclopAttr", name));
  }
  return *this;
}
// Attaches a dtype-valued attribute. The attribute must hold an int that is a
// framework::proto::VarType::Type; it is converted to the matching aclDataType
// and set via aclopSetAttrDataType. Exists because typeid-based dispatch in
// AddAttr would route such ints to aclopSetAttrInt instead.
NpuOpRunner &NpuOpRunner::AddAttrDataType(const std::string &name,
                                          const NPUAttribute &attr) {
  PADDLE_ENFORCE_EQ(
      (attr.type() == typeid(int)),
      true,
      platform::errors::InvalidArgument(
          "Attr type is NOT equal to framework::proto::VarType::Type."));
  // Lazily create the ACL attribute container on first use.
  if (!attr_) {
    attr_ = aclopCreateAttr();
  }
  auto dtype = ConvertToNpuDtype(static_cast<framework::proto::VarType::Type>(
      PADDLE_GET_CONST(int, attr)));
  PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrDataType(attr_, name.c_str(), dtype));
  return *this;
}
// Registers every attribute in `attrs` on this runner, delegating each entry
// to AddAttr. Returns *this so calls can be chained.
NpuOpRunner &NpuOpRunner::AddAttrs(const NPUAttributeMap &attrs) {
  for (const auto &[attr_name, attr_value] : attrs) {
    AddAttr(attr_name, attr_value);
  }
  return *this;
}
// Appends a device-resident input tensor: builds its aclTensorDesc and
// aclDataBuffer. Order of AddInput calls defines the operator's input order.
NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor) {
  // create aclTensorDesc
  input_descs_.emplace_back(CreateTensorDesc(tensor));
  // create aclDataBuffer
  input_buffers_.emplace_back(CreateDataBuffer(tensor));
  return *this;
}
// Same as above, but lets the caller mark the tensor's memory placement
// (e.g. ACL_MEMTYPE_HOST for host-resident shape/axis tensors).
NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor, aclMemType mem_type) {
  // create aclTensorDesc
  input_descs_.emplace_back(CreateTensorDesc(tensor, mem_type));
  // create aclDataBuffer
  input_buffers_.emplace_back(CreateDataBuffer(tensor));
  return *this;
}
// The four overloads below accept small host-side vectors (shapes, axes,
// scalar lists) as inputs. Each copies the values into a CPU tensor that is
// kept alive in host_tensors_ (the desc/buffer reference its storage), and
// registers it with ACL_MEMTYPE_HOST placement.
NpuOpRunner &NpuOpRunner::AddInput(std::vector<int32_t> &&dims) {
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *dev_ctx =
      static_cast<phi::CPUContext *>(pool.Get(platform::CPUPlace()));
  Tensor host_tensor;
  paddle::framework::TensorFromVector(dims, *dev_ctx, &host_tensor);
  // Keep the host tensor alive for the lifetime of the runner.
  host_tensors_.emplace_back(host_tensor);
  // create aclTensorDesc
  input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST));
  // create aclDataBuffer
  input_buffers_.emplace_back(CreateDataBuffer(host_tensor));
  return *this;
}
// int64 variant of the host-vector input overload.
NpuOpRunner &NpuOpRunner::AddInput(std::vector<int64_t> &&dims) {
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *dev_ctx =
      static_cast<phi::CPUContext *>(pool.Get(platform::CPUPlace()));
  Tensor host_tensor;
  paddle::framework::TensorFromVector(dims, *dev_ctx, &host_tensor);
  host_tensors_.emplace_back(host_tensor);
  // create aclTensorDesc
  input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST));
  // create aclDataBuffer
  input_buffers_.emplace_back(CreateDataBuffer(host_tensor));
  return *this;
}
// float variant of the host-vector input overload.
NpuOpRunner &NpuOpRunner::AddInput(std::vector<float> &&values) {
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *dev_ctx =
      static_cast<phi::CPUContext *>(pool.Get(platform::CPUPlace()));
  Tensor host_tensor;
  paddle::framework::TensorFromVector(values, *dev_ctx, &host_tensor);
  host_tensors_.emplace_back(host_tensor);
  // create aclTensorDesc
  input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST));
  // create aclDataBuffer
  input_buffers_.emplace_back(CreateDataBuffer(host_tensor));
  return *this;
}
// double variant of the host-vector input overload.
NpuOpRunner &NpuOpRunner::AddInput(std::vector<double> &&values) {
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *dev_ctx =
      static_cast<phi::CPUContext *>(pool.Get(platform::CPUPlace()));
  Tensor host_tensor;
  paddle::framework::TensorFromVector(values, *dev_ctx, &host_tensor);
  host_tensors_.emplace_back(host_tensor);
  // create aclTensorDesc
  input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST));
  // create aclDataBuffer
  input_buffers_.emplace_back(CreateDataBuffer(host_tensor));
  return *this;
}
// Appends an output tensor: builds its aclTensorDesc and aclDataBuffer.
// The tensor's memory must already be allocated by the caller.
NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) {
  // create aclTensorDesc
  output_descs_.emplace_back(CreateTensorDesc(tensor));
  // create aclDataBuffer
  output_buffers_.emplace_back(CreateDataBuffer(tensor));
  return *this;
}
// Appends a list of device-resident input tensors in order, building an
// aclTensorDesc and aclDataBuffer for each. Returns *this for chaining.
NpuOpRunner &NpuOpRunner::AddInputs(const std::vector<Tensor> &tensors) {
  input_descs_.reserve(tensors.size());
  input_buffers_.reserve(tensors.size());
  // Iterate by const reference: the original `for (auto tensor : tensors)`
  // copied every Tensor per iteration.
  for (const auto &tensor : tensors) {
    // create aclTensorDesc
    input_descs_.emplace_back(CreateTensorDesc(tensor));
    // create aclDataBuffer
    input_buffers_.emplace_back(CreateDataBuffer(tensor));
  }
  return *this;
}
// NOTE(zhiqiu): For operators whose input is a list (such as concat, stack),
// It is needed to set the name of each input tensor.
// Assigns a name to each already-registered input descriptor; names.size()
// must match the number of inputs added so far.
NpuOpRunner &NpuOpRunner::AddInputNames(const std::vector<std::string> &names) {
  PADDLE_ENFORCE_EQ(names.size(),
                    input_descs_.size(),
                    platform::errors::InvalidArgument(
                        "The size of input names should be "
                        "equal to the size of input descs, but got the size "
                        "of input names is %d, the size of input descs is %d.",
                        names.size(),
                        input_descs_.size()));
  for (size_t i = 0; i < names.size(); ++i) {
    aclSetTensorDescName(input_descs_[i], names[i].c_str());
  }
  return *this;
}
// Appends a list of output tensors in order, building an aclTensorDesc and
// aclDataBuffer for each. Returns *this for chaining.
NpuOpRunner &NpuOpRunner::AddOutputs(const std::vector<Tensor> &tensors) {
  output_descs_.reserve(tensors.size());
  output_buffers_.reserve(tensors.size());
  // Iterate by const reference: the original `for (auto tensor : tensors)`
  // copied every Tensor per iteration.
  for (const auto &tensor : tensors) {
    // create aclTensorDesc
    output_descs_.emplace_back(CreateTensorDesc(tensor));
    // create aclDataBuffer
    output_buffers_.emplace_back(CreateDataBuffer(tensor));
  }
  return *this;
}
// Returns the aclTensorDesc of the index-th input; throws OutOfRange if the
// index exceeds the number of registered inputs. Ownership stays with the
// runner.
aclTensorDesc *NpuOpRunner::GetInputDesc(size_t index) {
  PADDLE_ENFORCE_LT(index,
                    input_descs_.size(),
                    platform::errors::OutOfRange(
                        "The index should be less than the size of inputs of "
                        "operator %s, but got index is %d and size is %d",
                        Type(),
                        index,
                        input_descs_.size()));
  return input_descs_[index];
}
// Returns the aclTensorDesc of the index-th output; throws OutOfRange if the
// index exceeds the number of registered outputs. Ownership stays with the
// runner.
aclTensorDesc *NpuOpRunner::GetOutputDesc(size_t index) {
  PADDLE_ENFORCE_LT(index,
                    output_descs_.size(),
                    platform::errors::OutOfRange(
                        "The index should be less than the size of output of "
                        "operator %s, but got index is %d and size is %d",
                        Type(),
                        index,
                        output_descs_.size()));
  return output_descs_[index];
}
// Accessors exposing the internal desc/buffer vectors (non-owning views for
// callers that drive aclopCompileAndExecute themselves).
std::vector<aclTensorDesc *> &NpuOpRunner::GetInputDescs() {
  return input_descs_;
}
std::vector<aclTensorDesc *> &NpuOpRunner::GetOutputDescs() {
  return output_descs_;
}
std::vector<aclDataBuffer *> &NpuOpRunner::GetInputBuffers() {
  return input_buffers_;
}
std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() {
  return output_buffers_;
}
// Builds an aclTensorDesc describing `tensor` (dtype, rank, dims, layout) and
// sets its storage format/shape; caller owns the returned desc (freed in the
// destructor). NOTE(review): the tensor is taken by value (one copy per call);
// the header declares the same by-value signature, so changing it requires a
// coordinated edit -- consider passing by const reference.
aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor,
                                             aclMemType mem_type) {
  auto dtype =
      ConvertToNpuDtype(framework::TransToProtoVarType(tensor.dtype()));
  auto format = ConvertToNpuFormat(tensor.layout());
  auto dims = phi::vectorize(tensor.dims());
  int size = dims.size();
  // TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU
  // OP must be a scalar with shape[0]. At present, the shape
  // of the `prob` Tensor of this OP is forced to be set to 0
  // in `npu_op_runner.cc`, which needs to be optimized later.
  if (op_type_ == "DropOutGenMask" && size == 1 && *(dims.data()) == 1) {
    size = 0;
  }
  VLOG(4) << "NPU dtype:" << dtype << " "
          << "rank:" << dims.size() << " dims:" << tensor.dims()
          << " format:" << format;
  auto *desc = aclCreateTensorDesc(dtype, size, dims.data(), format);
  PADDLE_ENFORCE_NOT_NULL(
      desc, platform::errors::External("Call aclCreateTensorDesc failed."));
  PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageFormat(desc, format));
  PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageShape(desc, size, dims.data()));
  // Host-resident tensors (shapes/axes) must be tagged for ACL placement.
  if (mem_type == ACL_MEMTYPE_HOST) {
    PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorPlaceMent(desc, mem_type));
  }
  return desc;
}
// Wraps the tensor's existing allocation in an aclDataBuffer (no copy of the
// payload; the buffer aliases tensor memory). Caller owns the returned handle
// (freed in the destructor).
aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
  void *ptr = tensor.data();
  VLOG(4) << "NPU ptr: " << ptr << ", size: " << tensor.memory_size();
  auto *buffer = aclCreateDataBuffer(ptr, tensor.memory_size());
  PADDLE_ENFORCE_NOT_NULL(
      buffer, platform::errors::External("Call aclCreateDataBuffer failed."));
  return buffer;
}
// Compiles and launches the configured CANN operator on `stream`
// (asynchronously). Falls back to the current device's default NPU stream
// when no stream is given. Honors FLAGS_npu_precision_mode if set. Throws on
// any non-zero ACL return code.
void NpuOpRunner::Run(aclrtStream stream) const {
  if (!stream) {
    VLOG(4) << "Run with default current npu stream: " << stream;
    stream = GetCurrentNPUStream();
  }
  VLOG(5) << "NpuOpRunner(" << this << ") Run:";
  VLOG(4) << "op_type: " << op_type_;
  VLOG(4) << "input_desc.size: " << input_descs_.size();
  VLOG(4) << "output_desc.size: " << output_descs_.size();
  VLOG(4) << "attr: " << attr_;
  VLOG(4) << "stream: " << stream;
  if (!FLAGS_npu_precision_mode.empty()) {
    PADDLE_ENFORCE_NPU_SUCCESS(
        aclSetCompileopt(ACL_PRECISION_MODE, FLAGS_npu_precision_mode.c_str()));
    VLOG(4) << "set ACL_PRECISION_MODE: " << FLAGS_npu_precision_mode;
  }
  aclError ret = aclopCompileAndExecute(op_type_.c_str(),
                                        input_descs_.size(),
                                        input_descs_.data(),
                                        input_buffers_.data(),
                                        output_descs_.size(),
                                        output_descs_.data(),
                                        output_buffers_.data(),
                                        attr_,
                                        ACL_ENGINE_SYS,
                                        ACL_COMPILE_SYS,
                                        nullptr,  // no extra compile options (was NULL)
                                        stream);
  VLOG(4) << "after aclopCompileAndExecute: " << ret;
  PADDLE_ENFORCE_NPU_SUCCESS(ret);
}
// Runs `op_runner` with inputs/outputs adapted to the dtypes required by the
// underlying CANN operator: inputs whose dtype differs from input_type[i] are
// Cast to it first; outputs whose dtype differs from output_type[i] are
// computed into temporaries and Cast back afterwards. Tensors whose dtype
// already matches are shared (no copy). A type entry of -1 also triggers the
// cast path (see the comparisons below).
void NpuOpRunner::TypeAdapter(
    const std::vector<Tensor> &inputs,
    const std::vector<Tensor> &outputs,
    const NPUAttributeMap &attrs,
    const platform::NPUDeviceContext &dev_ctx,
    std::function<void(const std::vector<Tensor> &,
                       const std::vector<Tensor> &,
                       const NPUAttributeMap &,
                       const platform::NPUDeviceContext &)> op_runner,
    const std::vector<framework::proto::VarType::Type> &input_type,
    const std::vector<framework::proto::VarType::Type> &output_type) {
  PADDLE_ENFORCE_EQ(
      inputs.size(),
      input_type.size(),
      platform::errors::InvalidArgument(
          "The number of inputs must be equal to input_type.size()."));
  PADDLE_ENFORCE_EQ(
      outputs.size(),
      output_type.size(),
      platform::errors::InvalidArgument(
          "The number of outputs must be equal to output_type.size()."));
  std::vector<Tensor> tmp_inputs(inputs.size());
  std::vector<Tensor> tmp_outputs(outputs.size());
  // Stage 1: cast (or alias) each input into the dtype the op expects.
  for (size_t i = 0; i < input_type.size(); ++i) {
    bool cast_input =
        (input_type[i] == -1 ||
         input_type[i] != framework::TransToProtoVarType(inputs[i].dtype()));
    if (!cast_input) {
      tmp_inputs[i].ShareDataWith(inputs[i]);
    } else {
      tmp_inputs[i].Resize(inputs[i].dims());
      tmp_inputs[i].mutable_data(dev_ctx.GetPlace(),
                                 framework::TransToPhiDataType(input_type[i]));
      const auto &cast_runner = NpuOpRunner(
          "Cast",
          {inputs[i]},
          {tmp_inputs[i]},
          {{"dst_type", static_cast<int>(ConvertToNpuDtype(input_type[i]))}});
      cast_runner.Run(dev_ctx.stream());
    }
  }
  // Stage 2: prepare output temporaries in the op's dtype (or alias when the
  // dtype already matches).
  for (size_t i = 0; i < output_type.size(); ++i) {
    bool cast_output =
        (output_type[i] == -1 ||
         output_type[i] != framework::TransToProtoVarType(outputs[i].dtype()));
    if (!cast_output) {
      tmp_outputs[i].ShareDataWith(outputs[i]);
    } else {
      tmp_outputs[i].Resize(outputs[i].dims());
      tmp_outputs[i].mutable_data(
          dev_ctx.GetPlace(), framework::TransToPhiDataType(output_type[i]));
    }
  }
  // Stage 3: run the wrapped op on the adapted tensors.
  op_runner(tmp_inputs, tmp_outputs, attrs, dev_ctx);
  // Stage 4: cast temporaries back into the caller's output dtypes.
  for (size_t i = 0; i < output_type.size(); ++i) {
    bool cast_output =
        (output_type[i] == -1 ||
         output_type[i] != framework::TransToProtoVarType(outputs[i].dtype()));
    if (cast_output) {
      const auto &cast_runner = NpuOpRunner(
          "Cast",
          {tmp_outputs[i]},
          {outputs[i]},
          {{"dst_type",
            static_cast<int>(ConvertToNpuDtype(
                framework::TransToProtoVarType(outputs[i].dtype())))}});
      cast_runner.Run(dev_ctx.stream());
    }
  }
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#pragma once
#include <paddle/fluid/framework/operator.h>
#include <paddle/fluid/framework/type_defs.h>
#include <string>
#include <vector>
#include "acl/acl.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
using NPUAttribute = framework::NPUAttribute;
using NPUAttributeMap = framework::NPUAttributeMap;
using DeviceContextPool = platform::DeviceContextPool;
// Builder-style wrapper around ACL's single-operator execution API
// (aclopCompileAndExecute): configure op type, inputs, outputs and attributes
// via chained calls, then launch with Run(). Owns all acl descriptors,
// buffers and the attribute container it creates; non-copyable to keep that
// ownership unique.
class NpuOpRunner {
 public:
  NpuOpRunner();
  explicit NpuOpRunner(const std::string &op_type);
  NpuOpRunner(const std::string &op_type,
              const std::vector<phi::DenseTensor> &inputs = {},
              const std::vector<phi::DenseTensor> &outputs = {},
              const NPUAttributeMap &attrs = {});
  // NOTE(zhiqiu): why forbid copy and operator= ?
  // Since we will free the tensor_descs and data_buffers in the ~NpuOpRunner,
  // if shallow copy is performed on tensor_descs and data_buffers, it may
  // result
  // in use-after-free bugs.
  NpuOpRunner(const NpuOpRunner &runner) = delete;
  NpuOpRunner &operator=(const NpuOpRunner &runner) = delete;
  ~NpuOpRunner();
  // Returns the CANN operator type name.
  const std::string &Type();
  // All Add*/Set* methods return *this to allow call chaining.
  NpuOpRunner &SetType(const std::string &name);
  NpuOpRunner &AddAttr(const std::string &name, const NPUAttribute &attr);
  // NOTE(qili93): need to add indivisual api for aclopSetAttrDataType
  // as typeid(aclDataType) and typeid(framework::proto::VarType::Type)
  // always go to attr.type() == typeid(int) to call aclopSetAttrInt
  NpuOpRunner &AddAttrDataType(const std::string &name,
                               const NPUAttribute &attr);
  NpuOpRunner &AddAttrs(const NPUAttributeMap &attrs);
  NpuOpRunner &AddInput(const phi::DenseTensor &tensor);
  // NOTE(zhiqiu): CANN-5.0.2 support input tensors on host.
  // Specifically, the tensor of shape, tensor of dims, etc, which are small
  // vector/list.
  NpuOpRunner &AddInput(const phi::DenseTensor &tensor, aclMemType mem_type);
  NpuOpRunner &AddInput(std::vector<int32_t> &&dims);
  NpuOpRunner &AddInput(std::vector<int64_t> &&dims);
  NpuOpRunner &AddInput(std::vector<float> &&values);
  NpuOpRunner &AddInput(std::vector<double> &&values);
  NpuOpRunner &AddOutput(const phi::DenseTensor &tensor);
  NpuOpRunner &AddInputs(const std::vector<phi::DenseTensor> &tensors);
  NpuOpRunner &AddInputNames(const std::vector<std::string> &names);
  NpuOpRunner &AddOutputs(const std::vector<phi::DenseTensor> &tensors);
  // Accessors return runner-owned ACL handles; do not free them.
  aclTensorDesc *GetInputDesc(size_t index);
  aclTensorDesc *GetOutputDesc(size_t index);
  std::vector<aclTensorDesc *> &GetInputDescs();
  std::vector<aclTensorDesc *> &GetOutputDescs();
  std::vector<aclDataBuffer *> &GetInputBuffers();
  std::vector<aclDataBuffer *> &GetOutputBuffers();
  // Launches the op asynchronously; defaults to the current NPU stream.
  void Run(aclrtStream stream = nullptr) const;
  // Runs op_runner with inputs/outputs cast to the dtypes the CANN op
  // requires, casting results back afterwards.
  static void TypeAdapter(
      const std::vector<phi::DenseTensor> &inputs,
      const std::vector<phi::DenseTensor> &outputs,
      const NPUAttributeMap &attrs,
      const platform::NPUDeviceContext &dev_ctx,
      std::function<void(const std::vector<phi::DenseTensor> &,
                         const std::vector<phi::DenseTensor> &,
                         const NPUAttributeMap &,
                         const platform::NPUDeviceContext &)> op_runner,
      const std::vector<framework::proto::VarType::Type> &input_type,
      const std::vector<framework::proto::VarType::Type> &output_type);
 private:
  aclTensorDesc *CreateTensorDesc(phi::DenseTensor tensor,
                                  aclMemType mem_type = ACL_MEMTYPE_DEVICE);
  aclDataBuffer *CreateDataBuffer(phi::DenseTensor tensor);
 private:
  std::string op_type_;
  std::vector<aclDataBuffer *> input_buffers_;
  std::vector<aclDataBuffer *> output_buffers_;
  std::vector<aclTensorDesc *> input_descs_;
  std::vector<aclTensorDesc *> output_descs_;
  // Keeps host-side shape/value tensors alive while their descs/buffers
  // reference the storage.
  std::vector<phi::DenseTensor> host_tensors_;
  aclopAttr *attr_{nullptr};
};
// Maps a Paddle proto dtype to the corresponding aclDataType.
aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);
// Returns the default ACL stream of `device_id` (-1 means current device).
aclrtStream GetCurrentNPUStream(int device_id = -1);
// Fills an already-allocated NPU tensor with a constant value. A
// single-element tensor is filled via an async pinned-memory copy (the pinned
// allocation's lifetime is protected by RecordEvent); larger tensors go
// through a host vector + TensorFromVector.
template <typename T>
void FillNpuTensorWithConstant(phi::DenseTensor *tensor, T val) {
  PADDLE_ENFORCE_EQ(
      tensor->IsInitialized(),
      true,
      platform::errors::InvalidArgument("The tensor should be initialized."));
  PADDLE_ENFORCE_EQ(
      platform::is_npu_place(tensor->place()),
      true,
      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
  int numel = tensor->numel();
  if (numel == 1) {
    // Fast path: one async copy from NPU-pinned host memory.
    phi::DenseTensor npu_pinned_tensor(tensor->dtype());
    platform::NPUPinnedPlace npu_pinned_place;
    auto npu_pinned_ptr =
        npu_pinned_tensor.mutable_data<T>({1}, npu_pinned_place);
    *npu_pinned_ptr = val;
    memory::Copy(tensor->place(),
                 tensor->data(),
                 npu_pinned_place,
                 npu_pinned_ptr,
                 sizeof(T),
                 GetCurrentNPUStream());
    // Record an event on the pinned allocation so it is not released before
    // the async copy above completes.
    auto npu_pinned_allocator =
        static_cast<paddle::memory::allocation::NPUPinnedAllocator *>(
            paddle::memory::allocation::AllocatorFacade::Instance()
                .GetAllocator(npu_pinned_place)
                .get());
    phi::Allocation *allocation = npu_pinned_tensor.Holder().get();
    npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream());
  } else {
    // General path: materialize the constant on host, then copy to device.
    std::vector<T> vec(numel, static_cast<T>(val));
    auto device_id = platform::GetCurrentNPUDeviceId();
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
        pool.Get(platform::NPUPlace(device_id)));
    paddle::framework::TensorFromVector<T>(vec, *dev_ctx, tensor);
  }
}
} // namespace operators
} // namespace paddle
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "acl/acl_prof.h"
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_ASCEND_STRING
// For CANN 20.2+
// ACL_AICORE_ARITHMETIC_UTILIZATION = 0, record arithmetic stats
// ACL_AICORE_PIPE_UTILIZATION = 1, record pipeline
// ACL_AICORE_MEMORY_BANDWIDTH = 2, record memory
// ACL_AICORE_L0B_AND_WIDTH = 3, record internal memory
// ACL_AICORE_RESOURCE_CONFLICT_RATIO = 5, record pipeline ratio
constexpr aclprofAicoreMetrics default_metrics =
    ACL_AICORE_ARITHMETIC_UTILIZATION;
#else
// For CANN 20.1
// ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats
// ACL_AICORE_PIPELINE = 1, record pipeline
// ACL_AICORE_SYNCHRONIZATION = 2, record sync
// ACL_AICORE_MEMORY = 3, record memory
// ACL_AICORE_INTERNAL_MEMORY = 4, record internal memory
// ACL_AICORE_STALL = 5, record pipeline ratio
constexpr aclprofAicoreMetrics default_metrics =
    ACL_AICORE_ARITHMATIC_THROUGHPUT;
#endif
// Default profiling switches passed to aclprofCreateConfig:
// ACL_PROF_ACL_API, record ACL API stats
// ACL_PROF_TASK_TIME, record AI core stats
// ACL_PROF_AICORE_METRICS, must include
// ACL_PROF_AICPU_TRACE, record AICPU, not supported yet
constexpr uint64_t default_type =
    ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS | ACL_PROF_TASK_TIME;
// Builds an aclprofConfig for the given devices (defaults to the current
// device when the list is empty). The caller owns the returned config and
// must release it with NPUProfilerDestroyConfig. Marked `inline` because this
// definition lives in a header included from multiple translation units;
// without it, each including TU emits its own definition and the link fails
// with multiple-definition (ODR) errors.
inline aclprofConfig *NPUProfilerCreateConfig(
    std::vector<uint32_t> devices = {},
    aclprofAicoreMetrics metrics = default_metrics,
    uint64_t c = default_type,
    aclprofAicoreEvents *events = nullptr) {
  if (devices.size() == 0) {
    int device_id = GetCurrentNPUDeviceId();
    devices.emplace_back(device_id);
  }
  aclprofConfig *config =
      aclprofCreateConfig(devices.data(), devices.size(), metrics, events, c);
  PADDLE_ENFORCE_NOT_NULL(config,
                          paddle::platform::errors::External(
                              "Failed to create prof config for NPU"));
  return config;
}
// The functions below are defined in a header included from multiple
// translation units, so they must be `inline` to avoid multiple-definition
// (ODR) link errors.
// Releases a config created by NPUProfilerCreateConfig.
inline void NPUProfilerDestroyConfig(const aclprofConfig *config) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofDestroyConfig(config));
}
// Initializes ACL profiling; results are written under output_path.
inline void NPUProfilerInit(std::string output_path) {
  PADDLE_ENFORCE_NPU_SUCCESS(
      aclprofInit(output_path.c_str(), output_path.size()));
}
// Starts profiling with the given config; nullptr means "current device with
// default metrics".
inline void NPUProfilerStart(const aclprofConfig *config) {
  if (config == nullptr) {
    // NOTE(zhiqiu): support single device by default.
    int device_id = GetCurrentNPUDeviceId();
    std::vector<uint32_t> devices = {static_cast<uint32_t>(device_id)};
    config = NPUProfilerCreateConfig(devices);
    // NOTE(review): the config created on this default path is never returned
    // to the caller, and a caller who passed nullptr cannot pass it to
    // NPUProfilerStop, so it is leaked -- confirm whether this is acceptable.
  }
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStart(config));
}
// Stops profiling and destroys the config (the caller must not reuse it).
inline void NPUProfilerStop(const aclprofConfig *config) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStop(config));
  NPUProfilerDestroyConfig(config);
}
// Finalizes ACL profiling; call after all profiling sessions are stopped.
inline void NPUProfilerFinalize() {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofFinalize());
}
// Thin non-owning wrapper around aclprofConfig*, used to pass the handle
// through bindings that cannot hold the raw ACL pointer type directly.
struct NPUProfConfigWrapper {
  aclprofConfig *p_;
  explicit NPUProfConfigWrapper(aclprofConfig *p) : p_(p) {}
  aclprofConfig *ptr() { return p_; }
};
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_resource_pool.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
namespace paddle {
namespace platform {
// Builds one lazy stream pool per visible NPU device. Each pool's creator
// switches to its device before creating a stream, and its deleter does the
// same before destroying one, so streams are always managed on the device
// that owns them.
NpuStreamResourcePool::NpuStreamResourcePool() {
  int dev_cnt = platform::GetNPUDeviceCount();
  pool_.reserve(dev_cnt);
  for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
    auto creator = [dev_idx] {
      platform::SetNPUDeviceId(dev_idx);
      aclrtStream stream;
      NPUStreamCreate(&stream);
      return stream;
    };
    auto deleter = [dev_idx](aclrtStream stream) {
      platform::SetNPUDeviceId(dev_idx);
      NPUStreamDestroy(stream);
    };
    pool_.emplace_back(ResourcePool<NpuStreamObject>::Create(creator, deleter));
  }
}
// Process-wide singleton accessor (function-local static, thread-safe init).
NpuStreamResourcePool& NpuStreamResourcePool::Instance() {
  static NpuStreamResourcePool pool;
  return pool;
}
// Borrows a stream for device dev_idx; the shared_ptr returns it to the pool
// when released. Throws if dev_idx is outside [0, device count).
std::shared_ptr<NpuStreamObject> NpuStreamResourcePool::New(int dev_idx) {
  PADDLE_ENFORCE_GE(
      dev_idx,
      0,
      platform::errors::InvalidArgument(
          "The dev_idx should be not less than 0, but got %d.", dev_idx));
  PADDLE_ENFORCE_LT(
      dev_idx,
      pool_.size(),
      platform::errors::OutOfRange(
          "The dev_idx should be less than device count %d, but got %d.",
          pool_.size(),
          dev_idx));
  return pool_[dev_idx]->New();
}
// Builds one lazy event pool per visible NPU device; creator/deleter switch
// to the owning device before creating/destroying an event (mirrors
// NpuStreamResourcePool).
NpuEventResourcePool::NpuEventResourcePool() {
  int dev_cnt = platform::GetNPUDeviceCount();
  pool_.reserve(dev_cnt);
  for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
    auto creator = [dev_idx] {
      platform::SetNPUDeviceId(dev_idx);
      aclrtEvent event;
      NPUEventCreate(&event);
      return event;
    };
    auto deleter = [dev_idx](aclrtEvent event) {
      platform::SetNPUDeviceId(dev_idx);
      NPUEventDestroy(event);
    };
    pool_.emplace_back(ResourcePool<NpuEventObject>::Create(creator, deleter));
  }
}
// Process-wide singleton accessor (function-local static, thread-safe init).
NpuEventResourcePool& NpuEventResourcePool::Instance() {
  static NpuEventResourcePool pool;
  return pool;
}
// Borrows an event for device dev_idx; the shared_ptr returns it to the pool
// when released. Throws if dev_idx is outside [0, device count).
std::shared_ptr<NpuEventObject> NpuEventResourcePool::New(int dev_idx) {
  PADDLE_ENFORCE_GE(
      dev_idx,
      0,
      platform::errors::InvalidArgument(
          "The dev_idx should be not less than 0, but got %d.", dev_idx));
  PADDLE_ENFORCE_LT(
      dev_idx,
      pool_.size(),
      platform::errors::OutOfRange(
          "The dev_idx should be less than device count %d, but got %d.",
          pool_.size(),
          dev_idx));
  return pool_[dev_idx]->New();
}
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <type_traits>
#include <vector>
#include "acl/acl.h"
#include "paddle/fluid/platform/resource_pool.h"
namespace paddle {
namespace platform {
// aclrtStream/aclrtEvent are opaque pointer typedefs; these aliases name the
// pointees so shared_ptr<NpuStreamObject> manages the raw ACL handle.
using NpuStreamObject = std::remove_pointer<aclrtStream>::type;
using NpuEventObject = std::remove_pointer<aclrtEvent>::type;
// Per-device pool of reusable NPU streams; singleton, non-copyable.
class NpuStreamResourcePool {
 public:
  // Borrows a stream for device dev_idx; returned to the pool on release.
  std::shared_ptr<NpuStreamObject> New(int dev_idx);
  static NpuStreamResourcePool &Instance();
 private:
  NpuStreamResourcePool();
  DISABLE_COPY_AND_ASSIGN(NpuStreamResourcePool);
 private:
  // One ResourcePool per device index.
  std::vector<std::shared_ptr<ResourcePool<NpuStreamObject>>> pool_;
};
// Per-device pool of reusable NPU events; singleton, non-copyable.
class NpuEventResourcePool {
 public:
  // Borrows an event for device dev_idx; returned to the pool on release.
  std::shared_ptr<NpuEventObject> New(int dev_idx);
  static NpuEventResourcePool &Instance();
 private:
  NpuEventResourcePool();
  DISABLE_COPY_AND_ASSIGN(NpuEventResourcePool);
 private:
  // One ResourcePool per device index.
  std::vector<std::shared_ptr<ResourcePool<NpuEventObject>>> pool_;
};
} // namespace platform
} // namespace paddle
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace stream {
// Creates the underlying aclrtStream on `place` (must be an NPU place) and
// sets up its callback manager. Always returns true; failures surface via
// PADDLE_ENFORCE / the NPU call wrappers.
bool NPUStream::Init(const Place& place) {
  PADDLE_ENFORCE_EQ(is_npu_place(place),
                    true,
                    platform::errors::InvalidArgument(
                        "NPU stream must be created using npu place."));
  place_ = place;
  // Ensure the stream is created on the device that will own it.
  NPUDeviceGuard guard(place_.device);
  NPUStreamCreate(&stream_);
  callback_manager_.reset(new StreamCallbackManager<aclrtStream>(stream_));
  VLOG(3) << "NPUStream Init stream: " << stream_;
  return true;
}
// Drains pending work and callbacks, then destroys the stream.
// NOTE(review): Wait()/WaitCallback() run unconditionally even when stream_
// is already null (e.g. a default-constructed NPUStream); assumes the
// underlying sync call and callback manager tolerate that -- confirm.
void NPUStream::Destroy() {
  NPUDeviceGuard guard(place_.device);
  Wait();
  WaitCallback();
  if (stream_) {
    NPUStreamDestroy(stream_);
  }
  stream_ = nullptr;
}
// Blocks the host until all work queued on this stream has completed.
void NPUStream::Wait() const { NPUStreamSync(stream_); }
} // namespace stream
} // namespace platform
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
#include <memory>
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream_callback_manager.h"
namespace paddle {
namespace platform {
namespace stream {
#ifdef PADDLE_WITH_ASCEND_CL
// RAII wrapper around an aclrtStream: creates the stream on construction
// (when given a place), destroys it — after draining work and callbacks — on
// destruction. Non-copyable; also owns a StreamCallbackManager for host
// callbacks enqueued against the stream.
class NPUStream final {
 public:
  NPUStream() = default;
  explicit NPUStream(const Place& place) { Init(place); }
  virtual ~NPUStream() { Destroy(); }
  bool Init(const Place& place);
  // Enqueues a host callback to run after currently-queued stream work.
  template <typename Callback>
  void AddCallback(Callback&& callback) const {
    callback_manager_->AddCallback(callback);
  }
  // Runs `callback` synchronously, then records `ev` on this stream.
  template <typename Callback>
  void RecordEvent(aclrtEvent ev, Callback callback) const {
    callback();
    NPUEventRecord(ev, stream_);
  }
  void RecordEvent(aclrtEvent ev) const { NPUEventRecord(ev, stream_); }
  // Makes this stream wait (device-side) until `ev` completes.
  void WaitEvent(aclrtEvent ev) const { NPUStreamWaitEvent(stream_, ev); }
  // Host-side block until all queued work finishes.
  void Wait() const;
  void WaitCallback() const { callback_manager_->Wait(); }
  aclrtStream raw_stream() const { return stream_; }
  void Destroy();
 private:
  Place place_;
  aclrtStream stream_{nullptr};
  std::unique_ptr<StreamCallbackManager<aclrtStream>> callback_manager_;
  DISABLE_COPY_AND_ASSIGN(NPUStream);
};
#endif
} // namespace stream
} // namespace platform
} // namespace paddle
......@@ -69,8 +69,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#endif
#include "paddle/phi/backends/device_ext.h"
......@@ -93,7 +91,6 @@ struct GpuDevice;
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
namespace paddle {
......
......@@ -14,7 +14,6 @@
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_resource_pool.h"
#include "paddle/fluid/platform/device_event_base.h"
#include "paddle/fluid/platform/event.h"
namespace paddle {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
//
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#endif
#include "paddle/phi/common/place.h"
......
......@@ -34,7 +34,6 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
#include "paddle/fluid/platform/device/npu/ascend_npu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/pybind/ascend_wrapper_py.h"
......
......@@ -141,7 +141,6 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
......
......@@ -141,7 +141,6 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
......
......@@ -156,8 +156,6 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device/npu/npu_profiler.h"
#endif
#ifdef PADDLE_WITH_XPU
......
......@@ -141,7 +141,6 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/phi/core/flags.h"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册