Unverified Commit 7fa415ca authored by Wang Xin, committed by GitHub

remove ASCEND* keyword (#53046)

* remove ASCEND* keyword

* update docstring

* bug fixed

* bug fixed
Parent a44d8555
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/ascend_trigger_op.h"
namespace paddle {
namespace operators {
class AscendTriggerOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(framework::proto::VarType::FP32,
ctx.device_context().GetPlace());
}
};
class AscendTriggerOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("FeedList", "FeedList of Ascend SubGraph").AsDuplicable();
AddOutput("FetchList", "FetchList of Ascend SubGraph").AsDuplicable();
AddAttr<int>("graph_idx", "(int, the graph index").SetDefault(-1);
AddComment(R"DOC(
Trigger Ascend SubGraph
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(ascend_trigger,
ops::AscendTriggerOp,
ops::AscendTriggerOpMaker);
PD_REGISTER_STRUCT_KERNEL(
ascend_trigger, CPU, ALL_LAYOUT, ops::AscendTriggerCPUKernel, float) {}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename T, typename DeviceContext>
class AscendTriggerCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Please compile WITH_ASCEND option to enable ascend_trigger op"));
}
};
} // namespace operators
} // namespace paddle
@@ -137,15 +137,6 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
template <typename T, typename DeviceContext> \
class op_name##CPUKernel : public CAllReduceOpCPUKernel<red_type, T> {};
template <ReduceType red_type, typename T>
class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
}
};
template <ReduceType red_type, typename T>
class CAllReduceOpXPUKernel : public framework::OpKernel<T> {
public:
...
@@ -127,15 +127,6 @@ class CReduceOpCPUKernel : public framework::OpKernel<T> {
template <typename T, typename DeviceContext> \
class op_name##CPUKernel : public CReduceOpCPUKernel<red_type, T> {};
template <ReduceType red_type, typename T>
class CReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
}
};
template <ReduceType red_type, typename T>
class CReduceOpXPUKernel : public framework::OpKernel<T> {
public:
...
@@ -49,7 +49,6 @@ register_unity_group(
cos_sim_op.cc
crf_decoding_op.cc
crop_op.cc
ascend_trigger_op.cc
conj_op.cc
imag_op.cc
kldiv_loss_op.cc
...
@@ -36,13 +36,3 @@ DEFINE_INT_STATUS(STAT_gpu12_mem_size)
DEFINE_INT_STATUS(STAT_gpu13_mem_size)
DEFINE_INT_STATUS(STAT_gpu14_mem_size)
DEFINE_INT_STATUS(STAT_gpu15_mem_size)
// For Ascend NPU
DEFINE_INT_STATUS(STAT_npu0_mem_size)
DEFINE_INT_STATUS(STAT_npu1_mem_size)
DEFINE_INT_STATUS(STAT_npu2_mem_size)
DEFINE_INT_STATUS(STAT_npu3_mem_size)
DEFINE_INT_STATUS(STAT_npu4_mem_size)
DEFINE_INT_STATUS(STAT_npu5_mem_size)
DEFINE_INT_STATUS(STAT_npu6_mem_size)
DEFINE_INT_STATUS(STAT_npu7_mem_size)
@@ -45,9 +45,6 @@ inline size_t Alignment(size_t size,
#endif
}
}
if (place.GetType() == phi::AllocationType::NPU) {
size += 32; // required by ascendcl
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
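The retained logic rounds a size up to the next multiple of the alignment (the removed NPU branch added 32 extra bytes, required by ascendcl, before this rounding). A minimal Python sketch of the rounding rule, for reference:

def alignment(size: int, align: int) -> int:
    # Round size up to the next multiple of align, mirroring the C++ above.
    remaining = size % align
    return size if remaining == 0 else size + (align - remaining)

assert alignment(100, 64) == 128  # padded up to the next multiple
assert alignment(128, 64) == 128  # already aligned, returned unchanged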
...
@@ -1153,16 +1153,6 @@ PADDLE_DEFINE_EXPORTED_string(jit_engine_type,
"Predictor",
"Choose default function type in JitLayer.");
/**
* Custom Device NPU related FLAG
* Name: FLAGS_npu_storage_format
* Since Version: 2.5.0
* Value Range: bool, default=false
* Example:
* Note: Enable NPU Storage Format for Ascend910 performance improvement.
*/
PADDLE_DEFINE_EXPORTED_bool(npu_storage_format, false, "");
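Before this removal, the flag could be toggled like any other exported Paddle flag; a hedged Python sketch, assuming a build in which the flag is still registered:

import paddle

# Hedged sketch: enabling the now-removed flag at runtime. On builds where
# the flag is not registered, set_flags raises an error.
paddle.set_flags({'FLAGS_npu_storage_format': True})
print(paddle.get_flags(['FLAGS_npu_storage_format']))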
#ifdef PADDLE_WITH_CUDNN_FRONTEND
/**
* CUDNNv8 related FLAG
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/npu_identity_kernel.h"
#include "glog/logging.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
namespace phi {
template <typename T, typename Context>
void NPUIdentityKernel(const Context& dev_ctx,
const DenseTensor& x,
const int format,
DenseTensor* out) {
VLOG(4) << "npu_identity op is only for NPU, please avoid using this kernel!";
out->ShareDataWith(x);
}
} // namespace phi
/** [ Why need npu_identity op? ]
*
* 1. Ascend CANN uses an internal storage format for high-performance
* computing; for example, running the BatchNorm2D op with the CANN internal
* storage format ACL_FORMAT_NC1HWC0 removes the transdata time cost and
* yields roughly a 2x performance improvement.
*
* 2. The internal storage format uses storage_properties_ in DenseTensor
* and changes the size and layout of the dense tensor, so this op should
* be called when converting the tensor to numpy, restoring the original
* size and format via the CANN Identity OP (a usage sketch follows the
* kernel registrations below).
*
* TODO(qili93): remove this op after custom op and custom device are
* integrated, and then move this op along with its code to the plugin.
*/
PD_REGISTER_KERNEL(npu_identity,
CPU,
ALL_LAYOUT,
phi::NPUIdentityKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool,
phi::dtype::float16) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(npu_identity,
GPU,
ALL_LAYOUT,
phi::NPUIdentityKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool,
phi::dtype::float16) {}
#endif
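As the comment above describes, call sites converted parameters to the CANN storage format and shared the converted storage back; a minimal sketch of that removed pattern (compare the conv and batch-norm hunks later in this diff), assuming an Ascend NPU custom device with FLAGS_npu_storage_format enabled:

import paddle
from paddle import _C_ops

ACL_FORMAT_NC1HWC0 = 3

def to_npu_storage_format(param):
    # Convert param to the CANN-internal NC1HWC0 storage format and share
    # the converted storage back into the original tensor, in place.
    with paddle.no_grad():
        trans = _C_ops.npu_identity(param, ACL_FORMAT_NC1HWC0)
        trans._share_underline_tensor_to(param)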
@@ -269,11 +269,6 @@ class ShardingOptimizer(MetaOptimizerBase):
self._gradient_merge_acc_step = gm_acc_step
self._optimizer_sharding = optimizer_sharding
# this feature is designed for ascend, and should NOT be used in GPU training
self.pp_allreduce_in_optimize = sharding_configs[
"pp_allreduce_in_optimize"
]
def _inner_opt_minimize(
self, loss, startup_program, parameter_list, no_grad_set
):
...
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.framework import core
from . import collective
OpRole = core.op_proto_and_checker_maker.OpRole
from paddle.distributed import fleet
class AscendTranspiler(collective.Collective):
def __init__(self, startup_program, main_program):
self.nrings = 1
super().__init__(self.nrings)
self._startup_program = startup_program
self._main_program = main_program
def _insert_allreduce_ops(self):
block = self._main_program.global_block()
ring_id = -1
grad = None
for idx, op in reversed(list(enumerate(block.ops))):
if (
self._is_backward_op(op)
and self.op_role_var_key in op.attr_names
):
op_role_var = op.all_attrs()[self.op_role_var_key]
if len(op_role_var) == 0:
continue
assert len(op_role_var) % 2 == 0
offset = idx
for i in range(0, len(op_role_var), 2):
param = block.vars[op_role_var[i]]
grad = block.vars[op_role_var[i + 1]]
if param.is_distributed:
continue
# As we search ops in reverse order, we insert the c_allreduce_sum
# op in the same way to keep the ring_id alternating (see the sketch
# after this class)
ring_id = (ring_id + 1) % self.nrings
block._insert_op(
offset + 1,
type='c_allreduce_sum',
inputs={'X': grad},
outputs={'Out': grad},
attrs={
'ring_id': ring_id,
self.op_role_key: OpRole.Backward,
},
)
block._insert_op(
offset + 2,
type='scale',
inputs={'X': grad},
outputs={'Out': grad},
attrs={
'scale': 1.0 / fleet.worker_num(),
self.op_role_key: OpRole.Backward,
},
)
if grad is None:
return
def transpile(self):
self._insert_allreduce_ops()
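For intuition, the inserted c_allreduce_sum ops cycle through the available rings; a standalone sketch with hypothetical gradient names (nrings raised above the fixed value of 1 used here, to make the rotation visible):

# Sketch of the ring_id rotation performed in _insert_allreduce_ops above.
nrings = 2  # hypothetical; AscendTranspiler fixes nrings = 1
ring_id = -1
for grad in ['conv1.w@GRAD', 'conv2.w@GRAD', 'fc.w@GRAD']:
    ring_id = (ring_id + 1) % nrings
    print(f'{grad} -> c_allreduce_sum on ring {ring_id}')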
@@ -377,12 +377,6 @@ def monkey_patch_varbase():
return None
new_ivar = self._grad_ivar()
# TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
if (
_global_flags()['FLAGS_npu_storage_format']
and 'npu' in get_all_custom_device_type()
):
new_ivar = paddle.incubate._npu_identity(x=new_ivar, format=-1)
new_ivar = new_ivar._copy_to(core.CPUPlace(), True)
if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS:
return (
...
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
from paddle import fluid
class TestAscendTriggerOP(unittest.TestCase):
"""TestCases for ascend_trigger op"""
def test_ascend_trigger_op(self):
paddle.enable_static()
program = fluid.Program()
block = program.global_block()
with fluid.program_guard(program):
x = paddle.static.data(
name='x', shape=[1], dtype='int64', lod_level=0
)
y = paddle.static.data(
name='y', shape=[1], dtype='int64', lod_level=0
)
block.append_op(
type="ascend_trigger",
inputs={"FeedList": [x]},
outputs={"FetchList": [y]},
attrs={'graph_idx': 0},
)
exe = paddle.static.Executor(paddle.CPUPlace())
try:
exe.run(program)
except RuntimeError:
pass
except Exception:
self.fail("ascend_trigger op raised an unexpected exception")
paddle.disable_static()
if __name__ == '__main__':
unittest.main()
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
RANK_TABLE_FILE_NAME="rank_table_file.json"
cat > ${RANK_TABLE_FILE_NAME} <<EOF
{
"status": "completed",
"version": "1.0",
"server_count": "1",
"server_list": [
{
"server_id": "127.0.0.1",
"device": [
{
"device_id": "0",
"device_ip": "192.1.184.23",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.2.21.93",
"rank_id": "1"
}
]
}
]
}
EOF
# set ascend rank table file env
export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"
# use ascend
echo "begin test use ascend npu"
distributed_args="--run_mode=collective --log_dir=testlog"
python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend
str1="selected_accelerators:0 selected_npus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
str2="selected_accelerators:1 selected_npus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
if grep -q "$str2" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 1"
exit -1
fi
# test async poll process
if [ -f $file_0 ]; then
rm $file_0
fi
if [ -f $file_1 ]; then
rm $file_1
fi
@@ -27,7 +27,6 @@ from .tensor import segment_sum
from .tensor import segment_mean
from .tensor import segment_max
from .tensor import segment_min
from .tensor import _npu_identity
from .passes import fuse_resnet_unit_pass
from . import autograd  # noqa: F401
...
@@ -16,6 +16,5 @@ from .math import segment_sum
from .math import segment_mean
from .math import segment_max
from .math import segment_min
from .manipulation import _npu_identity
__all__ = []
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import _C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper
__all__ = []
# TODO(qili93): remove this op after custom op and custom device
# are integrated, and then move this op along with its code to the plugin.
def _npu_identity(x, format=-1):
"""
This OP takes in the Tensor :attr:`x` and changes it to an output with
the aclFormat given as an int value. This API is only used for Ascend NPU.
Args:
x(Tensor): An input N-D Tensor with data type bool, float16,
float32, float64, int32, int64, int16, int8, uint8.
format(int): Storage data format of the output in aclFormat,
default value is -1.
Returns:
Tensor: A Tensor with acl storage format on Ascend NPU.
Examples:
.. code-block:: python
# required: npu
import paddle
x = paddle.ones(shape=[6])
y = paddle.incubate._npu_identity(x, 3) # ACL_FORMAT_NC1HWC0 = 3
# y.shape = [1, 1, 1, 1, 16]
"""
if in_dygraph_mode():
return _C_ops.npu_identity(x, format)
else:
check_variable_and_dtype(
x,
'x',
[
'bool',
'int8',
'uint8',
'int16',
'int32',
'int64',
'float16',
'float32',
'float64',
],
'npu_identity',
)
helper = LayerHelper('npu_identity', **locals())
out = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=x.stop_gradient
)
helper.append_op(
type='npu_identity',
inputs={'x': [x]},
outputs={'out': [out]},
attrs={'format': format},
)
return out
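A hedged static-graph sketch of the same pre-removal API, exercising the append_op branch above; it only builds the program, since actually changing the storage format requires NPU hardware:

import paddle

paddle.enable_static()
main = paddle.static.Program()
with paddle.static.program_guard(main):
    x = paddle.static.data(name='x', shape=[6], dtype='float32')
    # format=-1 keeps the current storage format
    y = paddle.incubate._npu_identity(x, format=-1)
paddle.disable_static()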
@@ -14,7 +14,6 @@
from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode
from paddle.device import (
get_all_custom_device_type,
is_compiled_with_cuda,
is_compiled_with_custom_device,
is_compiled_with_rocm,
@@ -27,7 +26,6 @@ from ...common_ops_import import Variable
from ...device import get_cudnn_version
from ...fluid.data_feeder import check_dtype, check_variable_and_dtype
from ...fluid.layer_helper import LayerHelper
from ...framework import no_grad
from ...tensor.manipulation import squeeze, unsqueeze
from ...utils import (
_contain_var,
@@ -145,16 +143,6 @@ def _conv_nd(
new_shape = [1] * len(x.shape)
new_shape[channel_dim] = -1
bias = bias.reshape(new_shape)
# TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
if (
_global_flags()['FLAGS_npu_storage_format']
and 'npu' in get_all_custom_device_type()
):
with no_grad():
bias_storage = _C_ops.npu_identity(
bias, 3
) # ACL_FORMAT_NC1HWC0 = 3
bias_storage._share_underline_tensor_to(bias)
return _C_ops.add(pre_bias, bias)
else:
return pre_bias
@@ -739,16 +727,6 @@ def conv2d(
+ bias.shape
+ [1 for i in range(len(x.shape) - channel_dim - 1)],
)
# TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
if (
_global_flags()['FLAGS_npu_storage_format']
and 'npu' in get_all_custom_device_type()
):
with no_grad():
bias_storage = _C_ops.npu_identity(
bias, 3
) # ACL_FORMAT_NC1HWC0 = 3
bias_storage._share_underline_tensor_to(bias)
return _C_ops.add(pre_bias, bias)
else:
return pre_bias
...
@@ -33,7 +33,6 @@ import warnings
import numpy as np
from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode
from paddle.device import get_all_custom_device_type
from paddle.fluid.framework import in_dygraph_mode
from ...fluid import dygraph_utils
@@ -724,30 +723,6 @@ class _BatchNormBase(Layer):
shape=param_shape,
)
self._variance.stop_gradient = True
# TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
if (
_global_flags()['FLAGS_npu_storage_format']
and 'npu' in get_all_custom_device_type()
):
with no_grad():
weight_trans = _C_ops.npu_identity(
self.weight, 3
) # ACL_FORMAT_NC1HWC0 = 3
bias_trans = _C_ops.npu_identity(
self.bias, 3
) # ACL_FORMAT_NC1HWC0 = 3
mean_trans = _C_ops.npu_identity(
self._mean, 3
) # ACL_FORMAT_NC1HWC0 = 3
var_trans = _C_ops.npu_identity(
self._variance, 3
) # ACL_FORMAT_NC1HWC0 = 3
weight_trans._share_underline_tensor_to(self.weight)
bias_trans._share_underline_tensor_to(self.bias)
mean_trans._share_underline_tensor_to(self._mean)
var_trans._share_underline_tensor_to(self._variance)
self._data_format = data_format
self._in_place = False
self._momentum = momentum
...
@@ -100,10 +100,8 @@ def _proccess_archs(arch):
assert a in [
"GPU",
"ROCM",
"ASCEND",
"ASCEND_CL",
"XPU", "XPU",
], f"""Supported arhc options are "GPU", "ROCM", "ASCEND" and "ASCEND_CL", "XPU", but the options is {a}""" ], f"""Supported arhc options are "GPU", "ROCM", and "XPU", but the options is {a}"""
archs += "WITH_" + a.upper() + " OR " archs += "WITH_" + a.upper() + " OR "
arch = "(" + archs[:-4] + ")" arch = "(" + archs[:-4] + ")"
else: else:
......
@@ -178,7 +178,6 @@ HIGH_PARALLEL_JOB_NEW = [
'version_test',
'test_broadcast_to_op',
'test_squared_mat_sub_fuse_pass',
'test_fleet_ascend_utils',
'test_fused_emb_seq_pool_op',
'test_imperative_data_loader_exit_func',
'test_feed_fetch_method',
@@ -460,7 +459,6 @@ HIGH_PARALLEL_JOB_NEW = [
'program_utils_test',
'test_fleet_distributed_strategy',
'test_hybrid_parallel_topology',
'test_ascend_trigger',
'test_fleet_rolemaker_3',
'test_conv_activation_mkldnn_fuse_pass',
'test_fusion_gru_bf16_mkldnn_op',
@@ -2117,7 +2115,6 @@ CPU_PARALLEL_JOB = [
'test_analyzer_multi_model_prediction',
'test_fleet_base_3',
'test_fleet_base_2',
'test_ascend_trigger',
'test_fleet_amp_meta_optimizer',
'test_fleetrun',
'dense_table_test',
@@ -3105,7 +3102,6 @@ TWO_PARALLEL_JOB = [
'test_get_inputs_outputs_in_block',
'test_get_device_properties',
'test_fleet_elastic_manager',
'test_fleet_ascend_utils',
'test_executor_check_fetch_list',
'test_eig_op',
'test_egr_performance_benchmark_fluid_cpu',
...